A dynamic network approach to bilingual child data

About this document

This document contains the script used to analyze the data of two German-English bilingual children, “Fion” and “Silvie”, for the paper “A dynamic network approach to bilingual child data”. The method is loosely based on Ibbotson, Salnikov & Walker (2019). The data come from corpora described in more detail in e.g. Quick et al. (2018) and Koch, Endesfelder Quick & Hartmann (2025).

Data availability

The “Fion” data are already available on CHILDES. The “Silvie” data are currently in the process of being anonymized; we expect the data to become public in 2026.

Preliminaries

Loading some packages:

# load packages
library(tidyverse)
library(tidytext)
library(ngram)
library(igraph)
library(ggraph)
library(patchwork)
# library(svglite)
# library(geomnet)
# library(rgexf)
# library(qgraph)
# library(genBaRcode)
# library(ggraph)
library(ggiraph)
library(tidygraph)
library(scales)
#install.packages(
 # "microViz",
  #repos = c(davidbarnett = "https://david-barnett.r-universe.dev", getOption("repos"))
#)
#if (!require("BiocManager", quietly = TRUE))
 # install.packages("BiocManager")

#BiocManager::install("phyloseq")
# library(microViz)
# library(tidygraph)
# library(plotly)
library(classInt)
library(grid)
# devtools::install_github("analyxcompany/ForceAtlas2")
library(ForceAtlas2)
# library(networkD3)

Toy example

This is just a toy example for illustrating the basic ideas:

testdf <- tibble(text = c("I love linguistics", "I like linguistics", "I love networks"))

test_graph <- testdf %>% unnest_tokens(output = "bigrams", input = "text", token = "ngrams", n = 2) %>% separate_wider_delim(cols = "bigrams", delim = " ", names = c("word1", "word2")) %>% graph_from_data_frame()
E(test_graph)$weight <- c(2,1,1,1,2,1)
ggplot(layout) +
  geom_edge_link(aes(x = x, y = y, xend = xend, yend = yend,
                     edge_width = weight), color = "gray") +
  scale_edge_width(range = c(0.1, 0.7)) +
  geom_node_point(col="red") +
   geom_node_text(aes(label = name), position = position_nudge(y = .1)) +
   scale_color_identity() + scale_fill_identity() +
   theme_void() +
  theme(legend.position = "none")

# ggsave("images/ilovenetworks.png", width = 7, height = 7, bg = "white")

Data

CHI: Monolingual + mixed

# child data
d_fion <- read_csv("../../master/fion_CHI.csv")
d_silvie <- read_csv("../../master/silvie_CHI.csv")

# caregiver data
fion_cds <- read_csv("/Users/stefanhartmann/Library/CloudStorage/Dropbox/Input_Project/Data/master/fion_input_with_language_tags.csv.zip")
silvie_cds <- read_csv("/Users/stefanhartmann/Library/CloudStorage/Dropbox/Input_Project/Data/master/silvie_input_with_language_tags.csv.zip")

# remove all CDS data with "xxx" or only consisting of "<trs>" because the language tagging is often off.

fion_cds <- fion_cds[-grep("xxx", fion_cds$Utterance),]
silvie_cds <- silvie_cds[-grep("xxx", silvie_cds$Utterance),]

fion_cds <- fion_cds[-grep("<tr.*", fion_cds$Utterance),]
silvie_cds <- silvie_cds[-grep("<tr.*", silvie_cds$Utterance),]

Data wrangling

Child data

We only want to take multi-word units into account, hence we first filter them out:

# add wordcount
d_fion$wordcount <- sapply(1:nrow(d_fion), 
                           function(i) wordcount(d_fion$Utterance_clean[i]))

d_silvie$wordcount <- sapply(1:nrow(d_silvie), 
                             function(i) wordcount(d_silvie$Utterance_clean[i]))

# only multi-word units
mwu_fion <- filter(d_fion, wordcount > 1)
mwu_silvie <- filter(d_silvie, wordcount > 1)

Adding language tags: The column “Lang_Tags” already contains word-by-word tags for the code-mixed utterances, but not for the monolingual ones. However, the “type” column contains the information whether the utterance is English, German, or mixed. Hence for all non-mixed utterances, we can fill up the Lang_Tags column with this information. Before doing so, we add a few annotations that are missing in the files:

mwu_fion <- mwu_fion %>% mutate(Lang_Tags = case_when(
  Utterance_clean == "darf ich this nicht aufraeumen" ~ "g g e g g",
  Utterance_clean == "komm ich this nicht aufraeumen" ~ "g g e g g",
  Utterance_clean == "und this my one" ~ "g e e e",
  Utterance_clean == "nein no" ~ "g e",
  Utterance_clean == "you did birthday in juni" ~ "e e e eg g",
  Utterance_clean == "that istis ein birthday my nanny" ~ "e ge g e e e",
  Utterance_clean == "und das ist von bob the builder und wendy" ~ "g g g g e e e g eg",
  .default = Lang_Tags
))


mwu_silvie <- mwu_silvie %>% mutate(Lang_Tags = case_when(Utterance_clean == "und this noch" ~ "g e g",
                                            Utterance_clean == "der postman pat" ~ "g e e",
                                            Utterance_clean == "ja a game" ~ "g e e",
                                            Utterance_clean == "ja this big one" ~ "g e e e",
                                            Utterance_clean == "ja a starfish" ~ "g e e",
                                            Utterance_clean == "ja the ribbon" ~ "g e e",
                                            Utterance_clean == "das heisst naemlich train train train train" ~ "g g g e e e e",
                                            .default = Lang_Tags))

Also, there are some inconsistencies in the tagging that lead to more factor levels than necessary, we correct those:

mwu_fion$Lang_Tags <- gsub("e\\(meta\\)", "e", mwu_fion$Lang_Tags)
mwu_fion$Lang_Tags <- gsub("ge", "eg", mwu_fion$Lang_Tags)
mwu_fion$Lang_Tags <- gsub("m", "eg", mwu_fion$Lang_Tags)

mwu_silvie$Lang_Tags <- gsub("ge", "eg", mwu_silvie$Lang_Tags)
mwu_silvie$Lang_Tags <- gsub("m", "eg", mwu_silvie$Lang_Tags)

Now we can proceed:

# add language tags
mwu_fion$Lang_Tags <- gsub("[[:punct:]]", "", mwu_fion$Lang_Tags)
mwu_silvie$Lang_Tags <- gsub("[[:punct:]]", "", mwu_silvie$Lang_Tags)

# add language tags on a word-by-word-basis to the non-code-mixed utterances
mwu_fion$Lang_Tags <- sapply(1:nrow(mwu_fion), function(i) ifelse(is.na(mwu_fion$Lang_Tags[i]), ifelse(mwu_fion[i,]$type=="german", paste0(rep("g", mwu_fion[i,]$wordcount), collapse = " "), paste0(rep("e", mwu_fion[i,]$wordcount), collapse = " ")), mwu_fion$Lang_Tags[i]))
mwu_silvie$Lang_Tags <- sapply(1:nrow(mwu_silvie), function(i) ifelse(is.na(mwu_silvie$Lang_Tags[i]), ifelse(mwu_silvie[i,]$type=="german", paste0(rep("g", mwu_silvie[i,]$wordcount), collapse = " "), paste0(rep("e", mwu_silvie[i,]$wordcount), collapse = " ")), mwu_silvie$Lang_Tags[i]))

Get bigrams

Child data

As we are interested in transition probabilities between words, we need bigrams, which we get using the unnest_tokens function from the tidytext package. For each bigram, we also want the language information for the individual words, which is why we also extract bigrams from the Lang_Tags column in a second step and then join the dataframes. Finally, we split up the bigrams so that word1 and word2 are in different columns, which makes it easier to calculate the transition probabilities.

bigrams_fion <- mwu_fion %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2, drop = FALSE)
bigrams_silvie <- mwu_silvie %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2, drop = FALSE)


bigrams_fion <- bind_cols(bigrams_fion,
                          mwu_fion %>% unnest_tokens(bigram_LangTag, Lang_Tags, token = "ngrams", n = 2, drop = FALSE) %>% select(bigram_LangTag))


bigrams_silvie <- bind_cols(bigrams_silvie,
                            mwu_silvie %>% unnest_tokens(bigram_LangTag, Lang_Tags, token = "ngrams", n = 2, drop = FALSE) %>% select(bigram_LangTag))


# one column for each word
bigrams_fion <- bigrams_fion %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)
bigrams_silvie <- bigrams_silvie %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)

bigrams_fion <- bigrams_fion %>% separate(bigram_LangTag, c("LangTag1", "LangTag2"), sep = " ", remove = F)
bigrams_silvie <- bigrams_silvie %>% separate(bigram_LangTag, c("LangTag1", "LangTag2"), sep = " ", remove = F)


# add child column
bigrams_fion <- mutate(bigrams_fion, Child = "Fion")
bigrams_silvie <- mutate(bigrams_silvie, Child = "Silvie")

CDS data

As in the child data, we extract bigrams:

# get bigrams --------------------------------------------------

bigrams_cds_fion <- fion_cds %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2)
bigrams_cds_silvie <- silvie_cds %>% unnest_tokens(bigram, Utterance_clean, token = "ngrams", n = 2)


# one column for each word
bigrams_cds_fion <- bigrams_cds_fion %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)
bigrams_cds_silvie <- bigrams_cds_silvie %>% separate(bigram, c("word1", "word2"), sep = " ", remove = F)

Transition probabilities

As we are interested in the transition probabilities between words, we add those to the dataframes.

get_transition_probabilities <- function(df_bigrams, input_column = "Utterance_clean", age_range = "all", Speaker = "all") {
  
  # filter
  if(any(age_range!="all")) {
    #df_orig <- filter(df_orig, age_range %in% age_range)
    df_bigrams <- filter(df_bigrams, age_range %in% age_range)
  }
  
  if(any(Speaker!="all")) {
    #df_orig <- filter(df_orig, Speaker %in% Speaker)
    df_bigrams <- filter(df_bigrams, Speaker %in% Speaker)
  }
  
  # unigrams 
  unigrams <- df_bigrams %>% select(Utt_no, all_of(input_column)) %>% unique() %>% unnest_tokens(output = "unigram", input = input_column, token = "ngrams", n = 1)
# unigrams <- df_orig %>% unnest_tokens(output = "unigram", input = input_column, token = "ngrams", n = 1)

unigrams_tbl <- unigrams$unigram %>% table() %>% as.data.frame() %>% setNames(c("unigram", "Freq"))


# count co-occurrence frequencies of bigrams:
# first grouped by Utterance number so that
# the bigrams do not cross utterance boundaries,
# then summing up across utterances.
bigrams_df_tbl <- df_bigrams %>% group_by(Utt_no, word1, word2) %>% summarise(
  n = n()
) %>% na.omit() %>% group_by(word1, word2) %>%
  summarise(
    n = sum(n)
  ) 

# add unigram frequencies
bigrams_df_tbl <- left_join(bigrams_df_tbl, unigrams_tbl, by = c("word1" = "unigram")) %>% setNames(c("word1", "word2", "n", "n_word1"))

bigrams_df_tbl <- left_join(bigrams_df_tbl, unigrams_tbl, by = c("word2" = "unigram")) %>% setNames(c("word1", "word2", "n", "n_word1", "n_word2"))

# add backward and forward transitional probabilities
bigrams_df_tbl <- bigrams_df_tbl %>% mutate(ftp = n / n_word1,
                            btp = n / n_word2)

# return
return(bigrams_df_tbl)

}

Function for periodization

This function adds a `Months` column to the data in which several months can be binned into larger groups. This makes it easier to try out different periodization options (or to stick with the original months data, i.e. have one network per month).

To make an informed decision about the way the data are split up into time slices, let’s first take a quick look at the distribution of data across the timespan:

Child speech

# number of utterances
d_fion %>% group_by(Month) %>% summarise(
  n_utterances = n(),
  n_files = length(unique(Filename))
) %>% ggplot(aes(x=Month, y = n_utterances, label = n_files)) +
  geom_point() +
  geom_line(group = 1) +
  geom_text(position = position_stack(), vjust = -0.4) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  ylab("Number of utterances") + 
  ggtitle("Fion (numbers indicate number of transcripts)") + d_silvie %>% group_by(Month) %>% summarise(
  n_utterances = n(),
  n_files = length(unique(Filename))
) %>% ggplot(aes(x=Month, y = n_utterances, label = n_files)) +
  geom_point() +
  geom_line(group = 1) +
  geom_text(position = position_stack(), vjust = -0.4) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  ylab("Number of utterances") + 
  ggtitle("Silvie (numbers indicate number of transcripts)")

# number of words
d_fion %>% group_by(Month) %>% summarise(
  n_words = sum(Wordcount),
  n_files = length(unique(Filename))
) %>% ggplot(aes(x=Month, y = n_words, label = n_files)) +
  geom_point() +
  geom_line(group = 1) +
  geom_text(position = position_stack(), vjust = -0.4) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  ylab("Number of words") + 
  ggtitle("Fion (numbers indicate number of transcripts)") + d_silvie %>% group_by(Month) %>% summarise(
  n_words = sum(Wordcount),
  n_files = length(unique(Filename))
) %>% ggplot(aes(x=Month, y = n_words, label = n_files)) +
  geom_point() +
  geom_line(group = 1) +
  geom_text(position = position_stack(), vjust = -0.4) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  ylab("Number of words") + 
  ggtitle("Silvie (numbers indicate number of transcripts)")

In both datasets, we have fewer transcripts and, hence, fewer words in the later periods. Thus, it seems useful to work with three-month periods and samples of 450 utterances.

# function for getting equal-sized intervals

get_equal_bins <- function(x, n) {
  
  cur_var <- 1:length(unique(x))
  
  cur_breaks <- round(seq(1, length(unique(x)), by = length(unique(x)) / n))
  
  # add last number to breaks
  cur_breaks[length(cur_breaks)+1] <- cur_var[length(cur_var)]
  
  # add 0 to var and breaks so that we can calculate +1
  # below to avoid overlaps between two groups
  cur_var <- c(0, cur_var)
  cur_breaks[1] <- 0
  
 # cur_breaks[1] <- 0 # to make sure that it starts with 1

  cur_list <- lapply(1:(length(cur_breaks)-1), function(i) cur_var[((which(cur_var==cur_breaks[i])+1)):(which(cur_var==cur_breaks[1+i]))])
  
  return(cur_list)
}



# function for adding months column
add_month_bins <- function(df, n_bins) {
  
cur_bins <- get_equal_bins(df$Month, n_bins)

# list to dataframe
cur_bins_df <- do.call(rbind, lapply(1:length(cur_bins), function(i) tibble(index = i,
       no    = cur_bins[[i]])))

# get bin range
bins_tbl01 <- tibble(age = unique(df$Month),
                     no = 1:length(unique(df$Month)))

# join
cur_bins_df <- left_join(cur_bins_df, bins_tbl01)


# bins in character form (from a to b)
cur_bins_ch <- sapply(1:length(cur_bins), function(i) paste0(unique(df$Month)[cur_bins[[i]][1]], "-",
unique(df$Month)[cur_bins[[i]][length(cur_bins[[i]])]]))

# in tabular form
bins_tbl <- tibble(bin = 1:length(cur_bins),
       age_range = cur_bins_ch)

# add to existing dataframe
cur_bins_df <- left_join(cur_bins_df, bins_tbl, by = c("index" = "bin"))


# return Months column
cur_df_with_age_range <- left_join(df, select(cur_bins_df, age, age_range), by = c("Month" = "age"))

return(cur_df_with_age_range)

}

Child-directed speech

Now we turn to the child-directed speech data:

# count:
# first grouped by Utterance number so that
# the bigrams do not cross utterance boundaries,
# then summing up across utterances.
bigrams_fion_cds_count <- bigrams_cds_fion %>% group_by(Utt_no, word1, word2, lang) %>% summarise(
  n = n()
) %>% na.omit() %>% group_by(word1, word2, lang) %>%
  summarise(
    n = sum(n)
  ) %>% arrange(desc(n))

bigrams_silvie_cds_count <- bigrams_cds_silvie %>% group_by(Utt_no, word1, word2, lang) %>% summarise(
  n = n()
) %>% na.omit() %>% group_by(word1, word2, lang) %>%
  summarise(
    n = sum(n)
  ) %>% arrange(desc(n))

Getting periods and samples

We want to work with three-month periods, hence we divide the Fion data into 21/3 = 7 bins, the Silvie data into 18/3 = 6 bins. For the child-directed speech data, we do not expect as significant differences between the age spans, so we only work with ~six-month bins.

Periods - child data

# how many months for Fion and Silvie?
length(unique(d_fion$Month))
[1] 21
length(unique(d_silvie$Month))
[1] 18
# add bins
bigrams_fion <- bigrams_fion %>% add_month_bins(n_bins = 7)
bigrams_silvie <- bigrams_silvie %>% add_month_bins(n_bins = 6)
d_fion <- d_fion %>% add_month_bins(n_bins = 7)
d_silvie <- d_silvie %>% add_month_bins(n_bins = 6)

Periods - CDS data

# Get bins
bigrams_cds_fion <- bigrams_cds_fion %>% add_month_bins(n = 3)
bigrams_cds_silvie <- bigrams_cds_silvie %>% add_month_bins(n = 3)

fion_cds <- fion_cds %>% add_month_bins(n = 3)
silvie_cds <- silvie_cds %>% add_month_bins(n = 3)

Sampling - Child data

The following code compiles 450-utterance samples from three-month periods for the child data, and 45,000-utterance samples from the child-directed speech data for each ~6-month period.

# get samples
set.seed(1234567)

# Fion:
for(i in 1:7) {
  cur_fion <- filter(bigrams_fion, age_range == levels(factor(bigrams_fion$age_range))[i])
  set.seed(i)
  cur_samples <- sample(1:length(unique(cur_fion$Utt_no)), 450)
# cur_samples <- lapply(1:100, function(i) sample(1:length(unique(cur_fion$Utt_no)), 450)) # alternative with bootstrapping
assign(paste0("bigrams_fion0", i), cur_fion[unlist(cur_samples),])
}

# Silvie:
for(i in 1:6) {
  cur_silvie <- filter(bigrams_silvie, age_range == levels(factor(bigrams_silvie$age_range))[i])
  set.seed(i)
  cur_samples <- sample(1:length(unique(cur_silvie$Utt_no)), 450)
#cur_samples <- lapply(1:100, function(i) sample(1:length(unique(cur_silvie$Utt_no)), 450))
assign(paste0("bigrams_silvie0", i), cur_silvie[unlist(cur_samples),])
}

Sampling - CDS data

# backup
bigrams_cds_fion_with_unigrams <- bigrams_cds_fion
bigrams_cds_silvie_with_unigrams <- bigrams_cds_silvie

# omit utterances consisting of only one word
bigrams_cds_fion <- bigrams_cds_fion %>% filter(Wordcount > 1)
bigrams_cds_silvie <- bigrams_cds_silvie %>% filter(Wordcount > 1)


# get 25,000-utterance samples for each of the bins
set.seed(12345)
for(i in 1:3) {
  cur_bigrams <- bigrams_cds_fion %>% filter(age_range == levels(factor(bigrams_cds_fion$age_range))[i])
cur_spl <- sample(1:nrow(cur_bigrams), 45000)
assign(paste0("bigrams_cds_fion0", i), cur_bigrams[cur_spl,])

}

set.seed(12345)
for(i in 1:3) {
  cur_bigrams <- bigrams_cds_silvie %>% filter(age_range == levels(factor(bigrams_cds_silvie$age_range))[i])
cur_spl <- sample(1:nrow(cur_bigrams), 45000)
assign(paste0("bigrams_cds_silvie0", i), cur_bigrams[cur_spl,])

}

Getting networks

The following function calculates the actual networks by calculating transition probabilities, filtering out all instances attested less than n_min (default: 5) times.

Networks for child language

get_network <- function(bigram_df,n_min = 0, modularity_measure = FALSE, edges = "ftp") {
  # count the bigram_df
bigrams_count <- bigram_df %>% group_by(LangTag1, LangTag2) %>% count(word1, word2, sort = T)


# filter out all below n_min
l <- bigrams_count %>%
    filter(n >= n_min)

# add transitional probabilities
l1 <- left_join(l, select(get_transition_probabilities(bigram_df, input_column = "Utterance_clean"), -n), by = c("word1", "word2") )

# check if there are data
if(nrow(l1) > 0) {
    
  # get bigram graph edges and vertices
    bigram_graph <- l1 %>%
      filter(n >= n_min) %>%
      ungroup %>% select(word1, word2, all_of(edges)) %>% graph_from_data_frame(directed = FALSE)
    
    # set weight attributes
    if(edges == "ftp") {
      bigram_graph <- set_edge_attr(bigram_graph, "weight", value = l1$ftp)
    } else if(edges=="btp") {
      bigram_graph <- set_edge_attr(bigram_graph, "weight", value = l1$btp)
    } else if(edges=="n") {
      bigram_graph <- set_edge_attr(bigram_graph, "weight", value = l1$n)
    }
    
    
    # set labels
    V(bigram_graph)$label <- V(bigram_graph)$name
    
    # Louvain clustering
    lv <- cluster_louvain(bigram_graph, resolution = .3)
  
    # add Louvain clustering to graph
    V(bigram_graph)$community <- membership(lv)
    
    # add color for each community
    cur_palette <- viridis::cividis(n = max(V(bigram_graph)$community))
    cur_communities <- V(bigram_graph)$community
    V(bigram_graph)$community_color <- sapply(1:length(V(bigram_graph)$community), function(i)  cur_communities[i])
    
    
    # add language and word frequency as attributes
    # to do so, we need a list of unigrams
    
    # language tags of unigrams
    unigram_LangTags <-  bind_cols(bigram_df %>% select(Utt_no, Utterance_clean, Lang_Tags) %>% unique() %>% unnest_tokens(output = "unigram", input = "Utterance_clean", token = "ngrams", n = 1),
    
    select(bigram_df %>% select(Utt_no, Utterance_clean, Lang_Tags) %>% unique() %>% unnest_tokens(output = "unigram_LangTag", input = Lang_Tags, token = "ngrams", n = 1), unigram_LangTag)) %>% select(unigram, unigram_LangTag) %>% unique()
    
    # get frequencies of individual words
    unigrams_freqs <- bigram_df %>% select(Utt_no, Utterance_clean) %>% unique() %>% unnest_tokens(output = "unigram", input = Utterance_clean) %>% group_by(unigram) %>% summarise(
      n = n()
    )
    
    # add unigram LangTags as attributes to the graph
    
    V(bigram_graph)$language <- sapply(1:length(V(bigram_graph)), function(i) unigram_LangTags[which(unigram_LangTags$unigram == V(bigram_graph)$name[i]),]$unigram_LangTag[1])
    
    # add color as attribute
    V(bigram_graph)$color <- case_when(V(bigram_graph)$language == "g" ~ "salmon",
              V(bigram_graph)$language == "e" ~ "deepskyblue",
              V(bigram_graph)$language == "eg" ~ "tan",
              .default = "grey")
    
    
    # add frequency as attribute
    V(bigram_graph)$Freq <- sapply(1:length(V(bigram_graph)), function(i) unigrams_freqs[which(unigrams_freqs$unigram==V(bigram_graph)$name[i]),]$n[1]) 
    
    
    
    # return graph or modularity measure
    if(modularity_measure) {
      return(modularity(lv))
    } else {
      return(bigram_graph)
    }
    
    
  }
}

Networks for child-directed speech

# add missing columns (for compatibility with the above-defined functions)
bigrams_cds_fion$LangTag1 <- bigrams_cds_fion$lang
bigrams_cds_fion$LangTag2 <- bigrams_cds_fion$lang
bigrams_cds_fion$Lang_Tags <- bigrams_cds_fion$lang
bigrams_cds_fion$Utterance_clean <- bigrams_cds_fion$Utterance

bigrams_cds_silvie$LangTag1 <- bigrams_cds_silvie$lang
bigrams_cds_silvie$LangTag2 <- bigrams_cds_silvie$lang
bigrams_cds_silvie$Lang_Tags <- bigrams_cds_silvie$lang
bigrams_cds_silvie$Utterance_clean <- bigrams_cds_silvie$Utterance

bigrams_cds_fion01$LangTag1 <- bigrams_cds_fion01$lang
bigrams_cds_fion01$LangTag2 <- bigrams_cds_fion01$lang
bigrams_cds_fion01$Lang_Tags <- bigrams_cds_fion01$lang
bigrams_cds_fion01$Utterance_clean <- bigrams_cds_fion01$Utterance

bigrams_cds_silvie01$LangTag1 <- bigrams_cds_silvie01$lang
bigrams_cds_silvie01$LangTag2 <- bigrams_cds_silvie01$lang
bigrams_cds_silvie01$Lang_Tags <- bigrams_cds_silvie01$lang
bigrams_cds_silvie01$Utterance_clean <- bigrams_cds_silvie01$Utterance

bigrams_cds_fion02$LangTag1 <- bigrams_cds_fion02$lang
bigrams_cds_fion02$LangTag2 <- bigrams_cds_fion02$lang
bigrams_cds_fion02$Lang_Tags <- bigrams_cds_fion02$lang
bigrams_cds_fion02$Utterance_clean <- bigrams_cds_fion02$Utterance

bigrams_cds_silvie02$LangTag1 <- bigrams_cds_silvie02$lang
bigrams_cds_silvie02$LangTag2 <- bigrams_cds_silvie02$lang
bigrams_cds_silvie02$Lang_Tags <- bigrams_cds_silvie02$lang
bigrams_cds_silvie02$Utterance_clean <- bigrams_cds_silvie02$Utterance


bigrams_cds_fion03$LangTag1 <- bigrams_cds_fion03$lang
bigrams_cds_fion03$LangTag2 <- bigrams_cds_fion03$lang
bigrams_cds_fion03$Lang_Tags <- bigrams_cds_fion03$lang
bigrams_cds_fion03$Utterance_clean <- bigrams_cds_fion03$Utterance

bigrams_cds_silvie03$LangTag1 <- bigrams_cds_silvie03$lang
bigrams_cds_silvie03$LangTag2 <- bigrams_cds_silvie03$lang
bigrams_cds_silvie03$Lang_Tags <- bigrams_cds_silvie03$lang
bigrams_cds_silvie03$Utterance_clean <- bigrams_cds_silvie03$Utterance


# function for getting CDS networks
get_network_cds <- function(bigram_df, n_min = 0, modularity_measure = FALSE, edges = "ftp") {
  bigrams_count <- bigram_df %>% group_by(LangTag1, LangTag2) %>% count(word1, word2, sort = T)

# bigrams_count <- na.omit(bigrams_count)

# filter out all below n_min
l <- bigrams_count %>%
    filter(n >= n_min)

# add transitional probabilities
l1 <- left_join(l, select(get_transition_probabilities(bigram_df, input_column = "Utterance_clean"), -n), by = c("word1", "word2") )

# omit NAs (usually blank spaces)
l1 <- na.omit(l1)

# check if there are data
if(nrow(l1) > 0) {
    # get bigram graph edges and vertices
    bigram_graph <- l1 %>%
      filter(n >= n_min) %>%
      ungroup %>% select(word1, word2, all_of(edges)) %>% graph_from_data_frame(directed = FALSE)
    
    # set weight attributes
    if(edges == "ftp") {
          bigram_graph <- set_edge_attr(bigram_graph, "weight", value = l1$ftp)
    } else if (edges == "btp") {
          bigram_graph <- set_edge_attr(bigram_graph, "weight", value = l1$btp)
    } else if(edges == "n") {
          bigram_graph <- set_edge_attr(bigram_graph, "weight", value = l1$n)
    }
    
    # set labels
    V(bigram_graph)$label <- V(bigram_graph)$name
    
    # Louvain clustering
    lv <- cluster_louvain(bigram_graph, resolution = .3)
  
    # add Louvain clustering to graph
    V(bigram_graph)$community <- membership(lv)
    
    # add color for each community
    cur_palette <- viridis::cividis(n = max(V(bigram_graph)$community))
    cur_communities <- V(bigram_graph)$community
    V(bigram_graph)$community_color <- sapply(1:length(V(bigram_graph)$community), function(i)  cur_communities[i])
    
    
    # add language and word frequency as attributes
    # to do so, we need a list of unigrams
    
    # language tags of unigrams
    lang_tbl <- unique(select(bigram_df, word1, word2, lang)) %>% pivot_longer(cols = 1:2) %>% select(value, lang) %>% setNames(c("word", "language")) %>% unique()
   
     # add language tags to network
    V(bigram_graph)$language <-  sapply(1:length(V(bigram_graph)$name), function(i) lang_tbl[which(lang_tbl$word == V(bigram_graph)$name[i])[1],]$language)
      
     # get frequencies of individual words
    unigrams_freqs <- bigram_df %>% select(Utt_no, Utterance) %>% unique() %>% unnest_tokens(output = "unigram", input = Utterance) %>% group_by(unigram) %>% summarise(
      n = n()
    )
    
   
    # add color as attribute
    V(bigram_graph)$color <- case_when(V(bigram_graph)$language == "de" ~ "salmon",
              V(bigram_graph)$language == "en" ~ "deepskyblue",
              V(bigram_graph)$language == "mixed" ~ "tan",
              .default = "grey")
    
    
    # add frequency as attribute
    V(bigram_graph)$Freq <- sapply(1:length(V(bigram_graph)), function(i) unigrams_freqs[which(unigrams_freqs$unigram==V(bigram_graph)$name[i]),]$n[1]) 
}

# return graph or modularity measure
    if(modularity_measure) {
      return(modularity(lv))
    } else {
      return(bigram_graph)
    }

}

Visualization

The functions created above can now be combined to create networks for different age spans.

# function for plotting
get_plot <- function(cur_network, myseed = 1999, min_freq = 0, interactive = FALSE, repel = TRUE, max.overlaps = 20, communities = "all", input = "igraph", ellipse_alpha = 0.1, layout = "ForceAtlas2") {

if(input == "layout") {
  layout <- cur_network
} else {
  # get plot layout
  layout <- create_layout(cur_network, layout = "fr")
  
  

}
  
  if(any(layout == "fr")) {
    layout <- layout
  } else if(any(layout == "ForceAtlas2")) {
    set.seed(myseed)
    layout <- create_layout(test_graph, layout = "fr")
    layout[,1:2] <- layout.forceatlas2(test_graph, directed = TRUE, iterations = 1000, plotstep = 10) %>% as.data.frame() %>% setNames(c("x", "y"))
  
  
  
}
  


# allow for selecting individual communities  
if(any(communities != "all")) {
  layout <- filter(layout, community %in% communities)
}
  
# Build plot
set.seed(myseed)
p <- ggplot(layout) +
  geom_edge_link(aes(x = x, y = y, xend = xend, yend = yend,
                     edge_width = weight, alpha = weight), color = "gray") +
  scale_edge_width(range = c(0.1, 0.5)) +
  geom_point_interactive(
    aes(x = x, y = y, tooltip = name, color = color, size = Freq)
  ) +
   geom_node_text(aes(label = ifelse(Freq > min_freq, name, ""), size = Freq, color = community_color), repel = repel, max.overlaps = max.overlaps) +
  stat_ellipse(aes(x=x, y=y, group = as.factor(community), fill = community_color),
               geom = "polygon", alpha = ellipse_alpha, color = NA) +
   scale_color_identity() + scale_fill_identity() +
   theme_void() +
  theme(legend.position = "none")

if(interactive) {
  # Zoomable plot with girafe
g <- girafe(
  ggobj = p,
  options = list(
    opts_zoom(min = 1, max = 60),
    opts_toolbar(saveaspng = TRUE)
  )
)

return(g)

} else {
  return(p)
}
  

}

Plots for publication

Child speech

Overall networks
# create seven objects with Fion's networks
for(i in 1:7) {
  assign(paste0("network_fion0", i), get(paste0("bigrams_fion0", i)) %>% get_network())
}

# create six objects with Silvie's networks
for(i in 1:6) {
  assign(paste0("network_silvie0", i), get(paste0("bigrams_silvie0", i)) %>% get_network())
}

# create seven plot objects named p_f1 to p_f7 for Fion's networks
for(i in 1:7) {
  assign(paste0("p_f", i), get(paste0("network_fion0", i)) %>% get_plot(repel = TRUE, max.overlaps = 50, min_freq = 5) + ggtitle(levels(factor(bigrams_fion$age_range))[i]) + theme(plot.title = element_text(face = "bold", hjust = 0.5)))
}

# create six plot objects named p_f1 to p_f7 for Silvie's networks
for(i in 1:6) {
  assign(paste0("p_s", i), get(paste0("network_silvie0", i)) %>%  get_plot(repel = TRUE, min_freq = 5, max.overlaps = 50) + ggtitle(levels(factor(bigrams_silvie$age_range))[i]) + theme(plot.title = element_text(face = "bold", hjust = 0.5)))
}

# combine all in one big plot

# titles:
p_t1 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Bigram', ' network, Fion'))", size = 12, parse = TRUE) + theme_void() 
p_t2 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Bigram', ' network, Silvie'))", size = 12, parse = TRUE) + theme_void() 

# composite plot:
p_t1 /
(p_f1 | p_f2 | p_f3) /
(p_f4 | p_f5 | p_f6) /
p_f7 /
p_t2 /
(p_s1 | p_s2 | p_s3) /
(p_s4 | p_s5 | p_s6) +
  plot_layout(heights = c(.2,1,1,1,.2,1,1))

 ggsave("images/fion_silvie_networks450.png", width = 15, height = 20.5)
Largest communities
# get largest communities - Fion

for(i in 1:7) {
  cur_network <- get_network(get(paste0("bigrams_fion0", i)), edges = "n")
  cur_network_layout <- create_layout(cur_network, layout = "fr")
  
  largest_communities <- cur_network_layout %>% group_by(community) %>% summarise(
  n = n()
) %>% arrange(desc(n)) %>% head(4) %>% select(community) %>% as.vector() %>% unname() %>% unlist()
  
  prop_top_communities <- table(cur_network_layout$community) %>% prop.table() %>% as_tibble(.name_repair="unique") %>% setNames(c("community", "n")) %>% arrange(desc(n)) %>% head(4)
  
  
  p0001 <- cur_network %>% get_plot(communities=largest_communities[1]) + ggtitle(paste0("Community ", 1, " (", paste0(round(prop_top_communities$n[1], digits = 4)*100, "%"), ")"))
  p0002 <- cur_network %>% get_plot(communities=largest_communities[2]) + ggtitle(paste0("Community ", 2, " (", paste0(round(prop_top_communities$n[2], digits = 4)*100, "%"), ")"))
  p0003 <- cur_network %>% get_plot(communities=largest_communities[3]) + ggtitle(paste0("Community ", 3, " (", paste0(round(prop_top_communities$n[3], digits = 4)*100, "%"), ")"))
  p0004 <- cur_network %>% get_plot(communities=largest_communities[4]) + ggtitle(paste0("Community ", 4, " (", paste0(round(prop_top_communities$n[4], digits = 4)*100, "%"), ")"))
  
  cur_p <- (p0001 | p0002 | p0003 | p0004)
  assign(paste0("p_lc_f_period", i), cur_p)

}


# get largest communities - Silvie

for(i in 1:6) {
  cur_network <- get_network(get(paste0("bigrams_silvie0", i)), edges = "n")
  cur_network_layout <- create_layout(cur_network, layout = "fr")
  
  largest_communities <- cur_network_layout %>% group_by(community) %>% summarise(
  n = n()
) %>% arrange(desc(n)) %>% head(4) %>% select(community) %>% as.vector() %>% unname() %>% unlist()
  
  prop_top_communities <- table(cur_network_layout$community) %>% prop.table() %>% as_tibble(.name_repair="unique") %>% setNames(c("community", "n")) %>% arrange(desc(n)) %>% head(4)
  
  
  p0001 <- cur_network %>% get_plot(communities=largest_communities[1]) + ggtitle(paste0("Community ", 1, " (", paste0(round(prop_top_communities$n[1], digits = 4)*100, "%"), ")"))
  p0002 <- cur_network %>% get_plot(communities=largest_communities[2]) + ggtitle(paste0("Community ", 2, " (", paste0(round(prop_top_communities$n[2], digits = 4)*100, "%"), ")"))
  p0003 <- cur_network %>% get_plot(communities=largest_communities[3]) + ggtitle(paste0("Community ", 3, " (", paste0(round(prop_top_communities$n[3], digits = 4)*100, "%"), ")"))
  p0004 <- cur_network %>% get_plot(communities=largest_communities[4]) + ggtitle(paste0("Community ", 4, " (", paste0(round(prop_top_communities$n[4], digits = 4)*100, "%"), ")"))
  
  cur_p <- (p0001 | p0002 | p0003 | p0004)
  assign(paste0("p_lc_s_period", i), cur_p)

}


# plot titles
p_t1 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Largest', ' communities, Fion'))", size = 12, parse = TRUE) + theme_void() 

for(i in 1:7) {
  assign(paste0("title_agerange_fion", i), ggplot() + annotate("text", x = 4, y = 25, label = levels(factor(bigrams_fion$age_range))[i], size = 8, parse = FALSE) + theme_void())
}

p_t2 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Largest', ' communities, Silvie'))", size = 12, parse = TRUE) + theme_void() 

for(i in 1:6) {
  assign(paste0("title_agerange_silvie", i), ggplot() + annotate("text", x = 4, y = 25, label = levels(factor(bigrams_fion$age_range))[i], size = 8, parse = FALSE) + theme_void())
}


# complete plot
p_t1 /
  title_agerange_fion1 /
  p_lc_f_period1 /
  title_agerange_fion2 /
  p_lc_f_period2 /
  title_agerange_fion3 /
  p_lc_f_period3 /
  title_agerange_fion4 /
  p_lc_f_period4 /
  title_agerange_fion5 /
  p_lc_f_period5 /  
  title_agerange_fion6 /
  p_lc_f_period6 /
  title_agerange_fion7 /
  p_lc_f_period7 + plot_layout(heights = c(.2, rep(c(.1,1), 7)))

# ggsave("images/largest_communities_fion_new.png", width = 16, height = 25)


# plot titles
p_t1 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Largest', ' communities, Silvie'))", size = 12, parse = TRUE) + theme_void() 

for(i in 1:7) {
  assign(paste0("title_agerange_silvie", i), ggplot() + annotate("text", x = 4, y = 25, label = levels(factor(bigrams_silvie$age_range))[i], size = 8, parse = FALSE) + theme_void())
}

p_t2 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Largest', ' communities, Silvie'))", size = 12, parse = TRUE) + theme_void() 

for(i in 1:6) {
  assign(paste0("title_agerange_silvie", i), ggplot() + annotate("text", x = 4, y = 25, label = levels(factor(bigrams_silvie$age_range))[i], size = 8, parse = FALSE) + theme_void())
}


# complete plot
p_t2 /
  title_agerange_silvie1 /
  p_lc_s_period1 /
  title_agerange_silvie2 /
  p_lc_s_period2 /
  title_agerange_silvie3 /
  p_lc_s_period3 /
  title_agerange_silvie4 /
  p_lc_s_period4 /
  title_agerange_silvie5 /
  p_lc_s_period5 /  
  title_agerange_silvie6 /
  p_lc_s_period6 + plot_layout(heights = c(.2, rep(c(.1,1), 6)))

ggsave("images/largest_communities_silvie_new.png", width = 16, height = 25)

Child-directed speech

Overall networks
# networs for each of the three time slots, separately for mothers and fathers

# Silvie - mother
silvie_MOT01 <- bigrams_cds_silvie01 %>% filter(Speaker == "MOT") %>% get_network_cds()
silvie_MOT02 <- bigrams_cds_silvie02 %>% filter(Speaker == "MOT") %>% get_network_cds()
silvie_MOT03 <- bigrams_cds_silvie03 %>% filter(Speaker == "MOT") %>% get_network_cds()

# Silvie - father
silvie_FAT01 <- bigrams_cds_silvie01 %>% filter(Speaker == "FAT") %>% get_network_cds()
silvie_FAT02<- bigrams_cds_silvie02 %>% filter(Speaker == "FAT") %>% get_network_cds()
silvie_FAT03 <- bigrams_cds_silvie03 %>% filter(Speaker == "FAT") %>% get_network_cds()

# Fion - mother
fion_MOT01 <- bigrams_cds_fion01 %>% filter(Speaker == "MOT") %>% get_network_cds()
fion_MOT02 <- bigrams_cds_fion02 %>% filter(Speaker == "MOT") %>% get_network_cds()
fion_MOT03 <- bigrams_cds_fion03 %>% filter(Speaker == "MOT") %>% get_network_cds()

# Fion - father
fion_FAT01 <- bigrams_cds_fion01 %>% filter(Speaker == "FAT") %>% get_network_cds()
fion_FAT02<- bigrams_cds_fion02 %>% filter(Speaker == "FAT") %>% get_network_cds()
fion_FAT03 <- bigrams_cds_fion03 %>% filter(Speaker == "FAT") %>% get_network_cds()

We don’t include the images here to keep the size of the file a bit lower, but the png files can be found in the “images” folder (`images/networks_cds.png`).

# plot networks

# titles
p_t1 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Bigram', ' networks, Fion', '\\'s', ' mother'))", size = 16, parse = TRUE) + theme_void() 
p_t2 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Bigram', ' networks, Fion', '\\'s', ' father'))", size = 16, parse = TRUE) + theme_void() 
p_t3 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Bigram', ' networks, Silvie', '\\'s', ' mother'))", size = 16, parse = TRUE) + theme_void() 
p_t4 <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Bigram', ' networks, Silvie', '\\'s', ' father'))", size = 16, parse = TRUE) + theme_void() 


p_t1 /
(get_plot(fion_MOT01, repel = F, min_freq =3, ellipse_alpha = .001) |
get_plot(fion_MOT02, repel = F, min_freq =3, ellipse_alpha = .001) |
get_plot(fion_MOT03, repel = F, min_freq =3, ellipse_alpha = .001) ) /
p_t2 /
(get_plot(fion_FAT01, repel = F, min_freq =3, ellipse_alpha = .001) |
get_plot(fion_FAT02, repel = F, min_freq =3, ellipse_alpha = .001) |
get_plot(fion_FAT03, repel = F, min_freq =3, ellipse_alpha = .001) ) /
p_t3 /
(get_plot(silvie_FAT01, repel = F, min_freq =3, ellipse_alpha = .001) |
get_plot(silvie_FAT02, repel = F, min_freq =3, ellipse_alpha = .001) |
get_plot(silvie_FAT03, repel = F, min_freq =3, ellipse_alpha = .001) ) /
p_t4 /
(get_plot(silvie_MOT01, repel = F, min_freq =3, ellipse_alpha = .001) |
get_plot(silvie_MOT02, repel = F, min_freq =3, ellipse_alpha = .001) |
get_plot(silvie_MOT03, repel = F, min_freq =3, ellipse_alpha = .001)) +
  plot_layout(heights = c(.1, 1, .1, 1, .1, 1, .1, 1))

# ggsave("images/networks_cds.png", width = 20, height = 30)
Largest communities
# networks spanning the entire time period
fion_FAT <- bigrams_cds_fion %>% filter(Speaker == "FAT") %>% get_network_cds()
fion_MOT <- bigrams_cds_fion %>% filter(Speaker == "MOT") %>% get_network_cds()
silvie_FAT <- bigrams_cds_silvie %>% filter(Speaker == "FAT") %>% get_network_cds()
silvie_MOT <- bigrams_cds_silvie %>% filter(Speaker == "MOT") %>% get_network_cds()


# get largest communities

# Silvie's mother
silvie_MOT_network <- silvie_MOT %>% create_layout(layout = "fr") 
largest_clusters <- silvie_MOT_network %>% select(community) %>% table() %>% sort(decreasing = T) %>% head(4) %>% enframe() %>% select(name) %>% as_vector() %>% unname()
largest_clusters_prop <- silvie_MOT_network %>% select(community) %>% table() %>% sort(decreasing = T) %>% prop.table() %>% head(4) %>% round(digits = 4)

# plot four largest communities
silvie_MOT_lc <-  ( silvie_MOT_network %>% filter(community %in% largest_clusters[1]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 1 (", largest_clusters_prop[1]*100, "%)")) |
silvie_MOT_network %>% filter(community %in% largest_clusters[2]) %>% get_plot(input = "layout") +  ggtitle(paste0("Community 2 (", largest_clusters_prop[2]*100, "%)")) |
silvie_MOT_network %>% filter(community %in% largest_clusters[3]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 3 (", largest_clusters_prop[3]*100, "%)")) |
    silvie_MOT_network %>% filter(community %in% largest_clusters[4]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 4 (", largest_clusters_prop[4]*100, "%)")))

# Silvie's father
silvie_FAT_network <- silvie_FAT %>% create_layout(layout = "fr") 
largest_clusters <- silvie_FAT_network %>% select(community) %>% table() %>% sort(decreasing = T) %>% head(4) %>% enframe() %>% select(name) %>% as_vector() %>% unname()
largest_clusters_prop <- silvie_FAT_network %>% select(community) %>% table() %>% sort(decreasing = T) %>% prop.table() %>% head(4) %>% round(digits = 4)

# plot four largest communities
silvie_FAT_lc <-  ( silvie_FAT_network %>% filter(community %in% largest_clusters[1]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 1 (", largest_clusters_prop[1]*100, "%)")) |
silvie_FAT_network %>% filter(community %in% largest_clusters[2]) %>% get_plot(input = "layout") +  ggtitle(paste0("Community 2 (", largest_clusters_prop[2]*100, "%)")) |
silvie_FAT_network %>% filter(community %in% largest_clusters[3]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 3 (", largest_clusters_prop[3]*100, "%)")) |
    silvie_FAT_network %>% filter(community %in% largest_clusters[4]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 4 (", largest_clusters_prop[4]*100, "%)")))

# Fion's mother

fion_MOT_network <- fion_MOT %>% create_layout(layout = "fr") 
largest_clusters <- fion_MOT_network %>% select(community) %>% table() %>% sort(decreasing = T) %>% head(4) %>% enframe() %>% select(name) %>% as_vector() %>% unname()
largest_clusters_prop <- fion_MOT_network %>% select(community) %>% table() %>% sort(decreasing = T) %>% prop.table() %>% head(4) %>% round(digits = 4)

# plot four largest communities
fion_MOT_lc <-  ( fion_MOT_network %>% filter(community %in% largest_clusters[1]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 1 (", largest_clusters_prop[1]*100, "%)")) |
fion_MOT_network %>% filter(community %in% largest_clusters[2]) %>% get_plot(input = "layout") +  ggtitle(paste0("Community 2 (", largest_clusters_prop[2]*100, "%)")) |
fion_MOT_network %>% filter(community %in% largest_clusters[3]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 3 (", largest_clusters_prop[3]*100, "%)")) |
    fion_MOT_network %>% filter(community %in% largest_clusters[4]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 4 (", largest_clusters_prop[4]*100, "%)")))

# fion's father
fion_FAT_network <- fion_FAT %>% create_layout(layout = "fr") 
largest_clusters <- fion_FAT_network %>% select(community) %>% table() %>% sort(decreasing = T) %>% head(4) %>% enframe() %>% select(name) %>% as_vector() %>% unname()
largest_clusters_prop <- fion_FAT_network %>% select(community) %>% table() %>% sort(decreasing = T) %>% prop.table() %>% head(4) %>% round(digits = 4)

# plot four largest communities
fion_FAT_lc <-  ( fion_FAT_network %>% filter(community %in% largest_clusters[1]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 1 (", largest_clusters_prop[1]*100, "%)")) |
fion_FAT_network %>% filter(community %in% largest_clusters[2]) %>% get_plot(input = "layout") +  ggtitle(paste0("Community 2 (", largest_clusters_prop[2]*100, "%)")) |
fion_FAT_network %>% filter(community %in% largest_clusters[3]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 3 (", largest_clusters_prop[3]*100, "%)")) |
    fion_FAT_network %>% filter(community %in% largest_clusters[4]) %>% get_plot(input = "layout") + ggtitle(paste0("Community 4 (", largest_clusters_prop[4]*100, "%)")))


# all in one big plot
title_fion_MOT <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Fion', ' MOT'))", size = 12, parse = TRUE) + theme_void() 
title_fion_FAT <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Fion', ' FAT'))", size = 12, parse = TRUE) + theme_void() 
title_silvie_MOT <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Silvie', ' MOT'))", size = 12, parse = TRUE) + theme_void() 
title_silvie_FAT <- ggplot() + annotate("text", x = 4, y = 25, label = "bold(paste('Silvie', ' FAT'))", size = 12, parse = TRUE) + theme_void() 

Plots for online viewing

Here we provide plots showing the full networks for each time slice for online viewing.

Child speech

# Fion

bigrams_fion01 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_fion02 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_fion03 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_fion04 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_fion05 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_fion06 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_fion07 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
# Silvie

bigrams_silvie01 %>% get_network() %>% get_plot(repel = FALSE, interactive = TRUE)
bigrams_silvie02 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_silvie03 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_silvie04 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_silvie05 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)
bigrams_silvie06 %>% get_network() %>% get_plot(repel = FALSE,interactive = TRUE)

Child-directed speech

Network size

Network size refers to the number of nodes in the network. We calculate it for each child in each month.

Child speech

network_sizes <- bind_rows(
  tibble(
  Child = "Fion",
  Language = "German",
  Size = sapply(1:length(levels(factor(bigrams_fion$Month))), function(i) length(V(bigrams_fion %>% filter(Month == levels(factor(bigrams_fion$Month))[i] & type == "german") %>% select(word1, word2) %>% graph_from_data_frame()))),
  Month = levels(factor(bigrams_fion$Month))
),

tibble(
  Child = "Fion",
  Language = "English",
  Size = sapply(1:length(levels(factor(bigrams_fion$Month))), function(i) length(V(bigrams_fion %>% filter(Month == levels(factor(bigrams_fion$Month))[i] & type == "english") %>% select(word1, word2) %>% graph_from_data_frame()))),
  Month = levels(factor(bigrams_fion$Month))
),

tibble(
  Child = "Fion",
  Language = "mixed",
  Size = sapply(1:length(levels(factor(bigrams_fion$Month))), function(i) length(V(bigrams_fion %>% filter(Month == levels(factor(bigrams_fion$Month))[i] & type == "mixed") %>% select(word1, word2) %>% graph_from_data_frame()))),
  Month = levels(factor(bigrams_fion$Month))
),

tibble(
  Child = "Fion",
  Language = "all",
  Size = sapply(1:length(levels(factor(bigrams_fion$Month))), function(i) length(V(bigrams_fion %>% filter(Month == levels(factor(bigrams_fion$Month))[i]) %>% select(word1, word2) %>% graph_from_data_frame()))),
  Month = levels(factor(bigrams_fion$Month))
),

tibble(
  Child = "Silvie",
  Language = "German",
  Size = sapply(1:length(levels(factor(bigrams_silvie$Month))), function(i) length(V(bigrams_silvie %>% filter(Month == levels(factor(bigrams_silvie$Month))[i] & type == "german") %>% select(word1, word2) %>% graph_from_data_frame()))),
  Month = levels(factor(bigrams_silvie$Month))
),

tibble(
  Child = "Silvie",
  Language = "English",
  Size = sapply(1:length(levels(factor(bigrams_silvie$Month))), function(i) length(V(bigrams_silvie %>% filter(Month == levels(factor(bigrams_silvie$Month))[i] & type == "english") %>% select(word1, word2) %>% graph_from_data_frame()))),
  Month = levels(factor(bigrams_silvie$Month))
),

tibble(
  Child = "Silvie",
  Language = "mixed",
  Size = sapply(1:length(levels(factor(bigrams_silvie$Month))), function(i) length(V(bigrams_silvie %>% filter(Month == levels(factor(bigrams_silvie$Month))[i] & type == "mixed") %>% select(word1, word2) %>% graph_from_data_frame()))),
  Month = levels(factor(bigrams_silvie$Month))
),

tibble(
  Child = "Silvie",
  Language = "all",
  Size = sapply(1:length(levels(factor(bigrams_silvie$Month))), function(i) length(V(bigrams_silvie %>% filter(Month == levels(factor(bigrams_silvie$Month))[i]) %>% select(word1, word2) %>% graph_from_data_frame()))),
  Month = levels(factor(bigrams_silvie$Month))
)
)

# add columns for letting the "all" line appear differently
network_sizes$alpha <- ifelse(network_sizes$Language=="all", 0.3, 1)
network_sizes$linetype <- ifelse(network_sizes$Language == "all", 2, 1)
network_sizes$linewidth <- ifelse(network_sizes$Language == "all", 2.5, .8)

ggplot(network_sizes, aes(x = Month, y = Size, group = Language, col = Language, linewidth = linewidth, linetype = linetype, alpha = alpha)) + geom_point() + geom_line() + facet_wrap(~Child) +  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  scale_color_manual(values = c("darkgreen", "deepskyblue", "salmon", "tan")) + scale_alpha_identity() + scale_linetype_identity() + scale_linewidth_identity() +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18))

# network sizes for the sampled data

network_sizes_sampled <- bind_rows(
  tibble(
  Child = "Fion",
  Language = "German",
  Period = levels(factor(bigrams_fion$age_range)),
  Size = sapply(1:7, function(i) length(V(get(paste0("bigrams_fion0", i)) %>% filter(type == "german") %>% select(word1, word2) %>% graph_from_data_frame())))
),

tibble(
  Child = "Fion",
  Language = "English",
  Period = levels(factor(bigrams_fion$age_range)),
  Size = sapply(1:7, function(i) length(V(get(paste0("bigrams_fion0", i)) %>% filter(type == "english") %>% select(word1, word2) %>% graph_from_data_frame())))
),

tibble(
  Child = "Fion",
  Language = "mixed",
  Period = levels(factor(bigrams_fion$age_range)),
  Size = sapply(1:7, function(i) length(V(get(paste0("bigrams_fion0", i)) %>% filter(type == "mixed") %>% select(word1, word2) %>% graph_from_data_frame())))
),

tibble(
  Child = "Fion",
  Language = "all",
  Period = levels(factor(bigrams_fion$age_range)),
  Size = sapply(1:7, function(i) length(V(get(paste0("bigrams_fion0", i)) %>% select(word1, word2) %>% graph_from_data_frame())))
),

tibble(
  Child = "Silvie",
  Language = "German",
  Period = levels(factor(bigrams_silvie$age_range)),
  Size = sapply(1:6, function(i) length(V(get(paste0("bigrams_silvie0", i)) %>% filter(type == "german") %>% select(word1, word2) %>% graph_from_data_frame())))
),

tibble(
  Child = "Silvie",
  Language = "English",
  Period = levels(factor(bigrams_silvie$age_range)),
  Size = sapply(1:6, function(i) length(V(get(paste0("bigrams_silvie0", i)) %>% filter(type == "english") %>% select(word1, word2) %>% graph_from_data_frame())))
),

tibble(
  Child = "Silvie",
  Language = "mixed",
  Period = levels(factor(bigrams_silvie$age_range)),
  Size = sapply(1:6, function(i) length(V(get(paste0("bigrams_silvie0", i)) %>% filter(type == "mixed") %>% select(word1, word2) %>% graph_from_data_frame())))
),

tibble(
  Child = "Silvie",
  Language = "all",
  Period = levels(factor(bigrams_silvie$age_range)),
  Size = sapply(1:6, function(i) length(V(get(paste0("bigrams_silvie0", i)) %>% select(word1, word2) %>% graph_from_data_frame())))
)
)



# add columns for letting the "all" line appear differently
network_sizes_sampled$alpha <- ifelse(network_sizes_sampled$Language=="all", 0.3, 1)
network_sizes_sampled$linetype <- ifelse(network_sizes_sampled$Language == "all", 2, 1)
network_sizes_sampled$linewidth <- ifelse(network_sizes_sampled$Language == "all", 2.5, .8)


ggplot(network_sizes_sampled, aes(x = Period, y = Size, group = Language, col = Language, linewidth = linewidth, linetype = linetype, alpha = alpha)) + geom_point() + geom_line() + facet_wrap(~Child, scales = "free_x") +  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  scale_color_manual(values = c("darkgreen", "deepskyblue", "salmon", "tan")) + scale_alpha_identity() + scale_linetype_identity() + scale_linewidth_identity() +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18))

# ggsave("images/network_size.png", width = 12, height = 6) 

Child-directed speech

network_sizes_cds <- bind_rows(
  tibble(
  Speaker = "FAT",
  Child = "Silvie",
  Language = "English",
  Period = levels(factor(bigrams_cds_silvie$age_range)),
  `Number of nodes` = sapply(1:3, function(i) length(V(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="FAT" & lang == "en") %>% get_network_cds()))),
  `Number of edges` = sapply(1:3, function(i) length(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="FAT" & lang == "en") %>% get_network_cds()))),
  `Average weight` = paste0(sapply(1:3, function(i) mean(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="FAT" & lang == "en") %>% get_network_cds())$weight)) %>% round(digits = 3), " (", sapply(1:3, function(i) sd(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="FAT" & lang == "en") %>% get_network_cds())$weight)) %>% round(digits = 3), ")")
),

tibble(
  Speaker = "FAT",
  Child = "Silvie",
  Language = "German",
  Period = levels(factor(bigrams_cds_silvie$age_range)),
  `Number of nodes` = sapply(1:3, function(i) length(V(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="FAT" & lang == "de") %>% get_network_cds()))),
  `Number of edges` = sapply(1:3, function(i) length(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="FAT" & lang == "de") %>% get_network_cds()))),
  `Average weight` = paste0(sapply(1:3, function(i) mean(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="FAT" & lang == "de") %>% get_network_cds())$weight)) %>% round(digits = 3), " (", sapply(1:3, function(i) sd(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="FAT" & lang == "de") %>% get_network_cds())$weight)) %>% round(digits = 3), ")")
),

tibble(
  Speaker = "MOT",
  Child = "Silvie",
  Language = "English",
  Period = levels(factor(bigrams_cds_silvie$age_range)),
  `Number of nodes` = sapply(1:3, function(i) length(V(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="MOT" & lang == "en") %>% get_network_cds()))),
  `Number of edges` = sapply(1:3, function(i) length(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="MOT" & lang == "en") %>% get_network_cds()))),
  `Average weight` = paste0(sapply(1:3, function(i) mean(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="MOT" & lang == "en") %>% get_network_cds())$weight)) %>% round(digits = 3), " (", sapply(1:3, function(i) sd(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="MOT" & lang == "en") %>% get_network_cds())$weight)) %>% round(digits = 3), ")")
),

tibble(
  Speaker = "MOT",
  Child = "Silvie",
  Language = "German",
  Period = levels(factor(bigrams_cds_silvie$age_range)),
  `Number of nodes` = sapply(1:3, function(i) length(V(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="MOT" & lang == "de") %>% get_network_cds()))),
  `Number of edges` = sapply(1:3, function(i) length(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="MOT" & lang == "de") %>% get_network_cds()))),
  `Average weight` = paste0(sapply(1:3, function(i) mean(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="MOT" & lang == "de") %>% get_network_cds())$weight)) %>% round(digits = 3), " (", sapply(1:3, function(i) sd(E(get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker=="MOT" & lang == "de") %>% get_network_cds())$weight)) %>% round(digits = 3), ")")
),

tibble(
  Speaker = "FAT",
  Child = "Fion",
  Language = "English",
  Period = levels(factor(bigrams_cds_fion$age_range)),
  `Number of nodes` = sapply(1:3, function(i) length(V(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="FAT" & lang == "en") %>% get_network_cds()))),
  `Number of edges` = sapply(1:3, function(i) length(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="FAT" & lang == "en") %>% get_network_cds()))),
  `Average weight` = paste0(sapply(1:3, function(i) mean(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="FAT" & lang == "en") %>% get_network_cds())$weight)) %>% round(digits = 3), " (", sapply(1:3, function(i) sd(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="FAT" & lang == "en") %>% get_network_cds())$weight)) %>% round(digits = 3), ")")
),

tibble(
  Speaker = "FAT",
  Child = "Fion",
  Language = "German",
  Period = levels(factor(bigrams_cds_fion$age_range)),
  `Number of nodes` = sapply(1:3, function(i) length(V(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="FAT" & lang == "de") %>% get_network_cds()))),
  `Number of edges` = sapply(1:3, function(i) length(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="FAT" & lang == "de") %>% get_network_cds()))),
  `Average weight` = paste0(sapply(1:3, function(i) mean(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="FAT" & lang == "de") %>% get_network_cds())$weight)) %>% round(digits = 3), " (", sapply(1:3, function(i) sd(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="FAT" & lang == "de") %>% get_network_cds())$weight)) %>% round(digits = 3), ")")
),

tibble(
  Speaker = "MOT",
  Child = "Fion",
  Language = "English",
  Period = levels(factor(bigrams_cds_fion$age_range)),
  `Number of nodes` = sapply(1:3, function(i) length(V(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="MOT" & lang == "en") %>% get_network_cds()))),
  `Number of edges` = sapply(1:3, function(i) length(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="MOT" & lang == "en") %>% get_network_cds()))),
  `Average weight` = paste0(sapply(1:3, function(i) mean(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="MOT" & lang == "en") %>% get_network_cds())$weight)) %>% round(digits = 3), " (", sapply(1:3, function(i) sd(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="MOT" & lang == "en") %>% get_network_cds())$weight)) %>% round(digits = 3), ")")
),

tibble(
  Speaker = "MOT",
  Child = "Fion",
  Language = "German",
  Period = levels(factor(bigrams_cds_fion$age_range)),
  `Number of nodes` = sapply(1:3, function(i) length(V(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="MOT" & lang == "de") %>% get_network_cds()))),
  `Number of edges` = sapply(1:3, function(i) length(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="MOT" & lang == "de") %>% get_network_cds()))),
  `Average weight` = paste0(sapply(1:3, function(i) mean(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="MOT" & lang == "de") %>% get_network_cds())$weight)) %>% round(digits = 3), " (", sapply(1:3, function(i) sd(E(get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker=="MOT" & lang == "de") %>% get_network_cds())$weight)) %>% round(digits = 3), ")")
)

)



# pivot to wider table
network_sizes_cds %>% pivot_wider(names_from = "Period", values_from = c("Number of nodes", "Number of edges", "Average weight")) #%>% writexl::write_xlsx("data/cds_network_sizes.xlsx")
# A tibble: 8 × 21
  Speaker Child  Language `Number of nodes_02;04-02;10` Number of nodes_02;11-…¹
  <chr>   <chr>  <chr>                            <int>                    <int>
1 FAT     Silvie English                            359                      350
2 FAT     Silvie German                            1668                     1548
3 MOT     Silvie English                           2726                     2548
4 MOT     Silvie German                             636                      633
5 FAT     Fion   English                             NA                       NA
6 FAT     Fion   German                              NA                       NA
7 MOT     Fion   English                             NA                       NA
8 MOT     Fion   German                              NA                       NA
# ℹ abbreviated name: ¹​`Number of nodes_02;11-03;04`
# ℹ 16 more variables: `Number of nodes_03;05-03;09` <int>,
#   `Number of nodes_02;03-02;10` <int>, `Number of nodes_02;11-03;05` <int>,
#   `Number of nodes_03;06-03;11` <int>, `Number of edges_02;04-02;10` <int>,
#   `Number of edges_02;11-03;04` <int>, `Number of edges_03;05-03;09` <int>,
#   `Number of edges_02;03-02;10` <int>, `Number of edges_02;11-03;05` <int>,
#   `Number of edges_03;06-03;11` <int>, `Average weight_02;04-02;10` <chr>, …

Average degree of networks

We are interested in the average degree of the networks, i.e. the total number of links divided by the total number of nodes. The average degree of a network can be seen as an index of the “productivity” of the words in use (here to be understood as the extent to which they combine with other words and appear in different contexts) and of the complexity of the utterances in question.

Here it is particularly interesting to take a closer look at the average degree of the German and English utterances in isolation, as well as the entire network (which also contains code-mixed utterances).

fion_network_DE <- bigrams_fion %>% filter(type == "german") %>% get_network()
fion_network_EN <- bigrams_fion %>% filter(type == "english") %>% get_network()

# get a dataframe with degree scores for each month
# for each child and alllanguage types (English, German, mixed, all)

degree_tbl <- bind_rows(
  tibble(Month = levels(factor(bigrams_fion$Month)),
      `Average degree` = sapply(1:length(levels(factor(bigrams_fion$Month))), function(i) mean(degree(bigrams_fion %>% filter(type == "german" & Month == levels(factor(bigrams_fion$Month))[i]) %>% get_network()))),
      Child = "Fion",
      Language = "German"),

tibble(Month = levels(factor(bigrams_fion$Month)),
      `Average degree` = sapply(1:length(levels(factor(bigrams_fion$Month))), function(i) mean(degree(bigrams_fion %>% filter(type == "english" & Month == levels(factor(bigrams_fion$Month))[i]) %>% get_network()))),
      Child = "Fion",
      Language = "English"),

tibble(Month = levels(factor(bigrams_silvie$Month)),
      `Average degree` = sapply(1:length(levels(factor(bigrams_silvie$Month))), function(i) mean(degree(bigrams_silvie %>% filter(type == "german" & Month == levels(factor(bigrams_silvie$Month))[i]) %>% get_network()))),
      Child = "Silvie",
      Language = "German"),

tibble(Month = levels(factor(bigrams_silvie$Month)),
      `Average degree` = sapply(1:length(levels(factor(bigrams_silvie$Month))), function(i) mean(degree(bigrams_silvie %>% filter(type == "english" & Month == levels(factor(bigrams_silvie$Month))[i]) %>% get_network()))),
      Child = "Silvie",
      Language = "English"),

tibble(Month = levels(factor(bigrams_fion$Month)),
      `Average degree` = sapply(1:length(levels(factor(bigrams_fion$Month))), function(i) mean(degree(bigrams_fion %>% filter(type == "mixed" & Month == levels(factor(bigrams_fion$Month))[i]) %>% get_network()))),
      Child = "Fion",
      Language = "mixed"),

tibble(Month = levels(factor(bigrams_silvie$Month)),
      `Average degree` = sapply(1:length(levels(factor(bigrams_silvie$Month))), function(i) mean(degree(bigrams_silvie %>% filter(type == "mixed" & Month == levels(factor(bigrams_silvie$Month))[i]) %>% get_network()))),
      Child = "Silvie",
      Language = "mixed"),

tibble(Month = levels(factor(bigrams_fion$Month)),
      `Average degree` = sapply(1:length(levels(factor(bigrams_fion$Month))), function(i) mean(degree(bigrams_fion %>% filter(Month == levels(factor(bigrams_fion$Month))[i]) %>% get_network()))),
      Child = "Fion",
      Language = "all"),

tibble(Month = levels(factor(bigrams_silvie$Month)),
      `Average degree` = sapply(1:length(levels(factor(bigrams_silvie$Month))), function(i) mean(degree(bigrams_silvie %>% filter(Month == levels(factor(bigrams_silvie$Month))[i]) %>% get_network()))),
      Child = "Silvie",
      Language = "all")
)


# add columns for letting the "all" line appear differently
degree_tbl$alpha <- ifelse(degree_tbl$Language=="all", 0.3, 1)
degree_tbl$linetype <- ifelse(degree_tbl$Language == "all", 2, 1)
degree_tbl$linewidth <- ifelse(degree_tbl$Language == "all", 2.5, .8)

degree_tbl %>% ggplot(aes(x = Month, y = `Average degree`, group = Language, col = Language, alpha = alpha, linetype = linetype, linewidth = linewidth)) +
  geom_point() + geom_line() + facet_wrap(~Child) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  scale_color_manual(values = c("darkgreen", "deepskyblue", "salmon", "tan")) + scale_alpha_identity() + scale_linetype_identity() + scale_linewidth_identity() +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18))

# ggsave("images/average_degree_fion_silvie.png", width = 12, height = 6)


# Child-directed speech

degree_tbl_cds <- bind_rows(
  tibble(
  Month = levels(factor(bigrams_cds_fion$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) mean(degree(bigrams_cds_fion %>% filter(Speaker == "FAT" & lang == "en" & Month == levels(factor(bigrams_cds_fion$Month))[i]) %>% get_network_cds()))),
  Speaker = "Fion's father",
  Language = "English"
),

tibble(
  Month = levels(factor(bigrams_cds_fion$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) mean(degree(bigrams_cds_fion %>% filter(Speaker == "FAT" & lang == "de" & Month == levels(factor(bigrams_cds_fion$Month))[i]) %>% get_network_cds()))),
  Speaker = "Fion's father",
  Language = "German"
),

tibble(
  Month = levels(factor(bigrams_cds_fion$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) mean(degree(bigrams_cds_fion %>% filter(Speaker == "FAT" & Month == levels(factor(bigrams_cds_fion$Month))[i]) %>% get_network_cds()))),
  Speaker = "Fion's father",
  Language = "all"
),

tibble(
  Month = levels(factor(bigrams_cds_silvie$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) mean(degree(bigrams_cds_silvie %>% filter(Speaker == "FAT" & lang == "en" & Month == levels(factor(bigrams_cds_silvie$Month))[i]) %>% get_network_cds()))),
  Speaker = "Silvie's father",
  Language = "English"
),

tibble(
  Month = levels(factor(bigrams_cds_silvie$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) mean(degree(bigrams_cds_silvie %>% filter(Speaker == "FAT" & lang == "de" & Month == levels(factor(bigrams_cds_silvie$Month))[i]) %>% get_network_cds()))),
  Speaker = "Silvie's father",
  Language = "German"
),

tibble(
  Month = levels(factor(bigrams_cds_silvie$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) mean(degree(bigrams_cds_silvie %>% filter(Speaker == "FAT" & Month == levels(factor(bigrams_cds_silvie$Month))[i]) %>% get_network_cds()))),
  Speaker = "Silvie's father",
  Language = "all"
),

tibble(
  Month = levels(factor(bigrams_cds_fion$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) mean(degree(bigrams_cds_fion %>% filter(Speaker == "MOT" & lang == "en" & Month == levels(factor(bigrams_cds_fion$Month))[i]) %>% get_network_cds()))),
  Speaker = "Fion's mother",
  Language = "English"
),

tibble(
  Month = levels(factor(bigrams_cds_fion$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) mean(degree(bigrams_cds_fion %>% filter(Speaker == "MOT" & lang == "de" & Month == levels(factor(bigrams_cds_fion$Month))[i]) %>% get_network_cds()))),
  Speaker = "Fion's mother",
  Language = "German"
),

tibble(
  Month = levels(factor(bigrams_cds_fion$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) mean(degree(bigrams_cds_fion %>% filter(Speaker == "MOT" & Month == levels(factor(bigrams_cds_fion$Month))[i]) %>% get_network_cds()))),
  Speaker = "Fion's mother",
  Language = "all"
),

tibble(
  Month = levels(factor(bigrams_cds_silvie$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) mean(degree(bigrams_cds_silvie %>% filter(Speaker == "MOT" & lang == "en" & Month == levels(factor(bigrams_cds_silvie$Month))[i]) %>% get_network_cds()))),
  Speaker = "Silvie's mother",
  Language = "English"
),

tibble(
  Month = levels(factor(bigrams_cds_silvie$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) mean(degree(bigrams_cds_silvie %>% filter(Speaker == "MOT" & lang == "de" & Month == levels(factor(bigrams_cds_silvie$Month))[i]) %>% get_network_cds()))),
  Speaker = "Silvie's mother",
  Language = "German"
),

tibble(
  Month = levels(factor(bigrams_cds_silvie$Month)),
  `Average degree` = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) mean(degree(bigrams_cds_silvie %>% filter(Speaker == "MOT" & Month == levels(factor(bigrams_cds_silvie$Month))[i]) %>% get_network_cds()))),
  Speaker = "Silvie's mother",
  Language = "all"
)

)

# add columns for letting the "all" line appear differently
degree_tbl_cds$alpha <- ifelse(degree_tbl_cds$Language=="all", 0.3, 1)
degree_tbl_cds$linetype <- ifelse(degree_tbl_cds$Language == "all", 2, 1)
degree_tbl_cds$linewidth <- ifelse(degree_tbl_cds$Language == "all", 2.5, .8)

degree_tbl_cds %>% ggplot(aes(x = Month, y = `Average degree`, group = Language, col = Language, alpha = alpha, linetype = linetype, linewidth = linewidth)) + geom_point() + geom_line() + facet_wrap(~Speaker) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  scale_color_manual(values = c("darkgreen", "deepskyblue", "salmon")) + scale_alpha_identity() + scale_linetype_identity() + scale_linewidth_identity() +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18))

  # ggsave("images/degree_cds.png", width = 10, height = 10)

Distribution of languages in CDS

To better understand the outcomes of the DNM analysis, we check the number of utterances and the distribution of different language tags in the CDS data.

# get number of utterances and average number of words per utterance for each speaker and month

se <- function(x) sqrt(var(x) / length(x))
 
fion_parents_mlu <- fion_cds %>% filter(Speaker %in% c("MOT", "FAT")) %>% group_by(Speaker, Month, lang) %>% summarise(
  mean = mean(Wordcount),
  se = se(Wordcount),
  se_min = mean-se,
  se_max = mean+se
) %>% ggplot(aes(x = Month, y = mean, group = lang, col = lang)) + geom_point() + geom_line() + geom_errorbar(aes(ymin = se_min, ymax = se_max)) + facet_wrap(~Speaker) + ylab("Mean length of utterances") +
  scale_color_manual(values = c("salmon", "deepskyblue")) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18)) + ggtitle("Fion's parents") + theme(plot.title = element_text(face = "bold", hjust = 0.5)) 

silvie_parents_mlu <- silvie_cds %>% filter(Speaker %in% c("MOT", "FAT")) %>% group_by(Speaker, Month, lang) %>% summarise(
  mean = mean(Wordcount),
  se = se(Wordcount),
  se_min = mean-se,
  se_max = mean+se
) %>% ggplot(aes(x = Month, y = mean, group = lang, col = lang)) + geom_point() + geom_line() + geom_errorbar(aes(ymin = se_min, ymax = se_max)) + facet_wrap(~Speaker) + ylab("Mean length of utterances") +
  scale_color_manual(values = c("salmon", "deepskyblue")) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18)) + ggtitle("Silvie's parents") + theme(plot.title = element_text(face = "bold", hjust = 0.5))

fion_parents_mlu /
  silvie_parents_mlu + plot_layout(axis_titles = "collect", guides = "collect")

fion_parents_n <- fion_cds %>% filter(Speaker %in% c("MOT", "FAT")) %>% group_by(Speaker, Month, lang) %>% filter(Speaker %in% c("MOT", "FAT")) %>% group_by(Speaker, Month, lang) %>% summarise(
  n = n(),
) %>% mutate(Language = case_when(lang=="de" ~ "German", lang=="en" ~ "English")) %>%  ggplot(aes(x = Month, y = n, group = Language, col = Language)) + geom_point() + geom_line() + facet_wrap(~Speaker) + ylab("Number of utterances") +
  scale_color_manual(values = c("salmon", "deepskyblue")) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18)) + ggtitle("Fion's parents") + theme(plot.title = element_text(face = "bold", hjust = 0.5)) 

silvie_parents_n <-silvie_cds %>% filter(Speaker %in% c("MOT", "FAT")) %>% group_by(Speaker, Month, lang) %>% summarise(
  n = n(),
) %>% mutate(Language = case_when(lang=="de" ~ "German", lang=="en" ~ "English")) %>% ggplot(aes(x = Month, y = n, group = Language, col = Language)) + geom_point() + geom_line() + facet_wrap(~Speaker) + ylab("Number of utterances") +
  scale_color_manual(values = c("salmon", "deepskyblue")) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18)) + ggtitle("Silvie's parents") + theme(plot.title = element_text(face = "bold", hjust = 0.5)) 

fion_parents_n /
  silvie_parents_n + plot_layout(axis_titles = "collect", guides = "collect")

# ggsave("images/cds_number_of_utterances.png", width = 11, height = 9)

Modularities

Modularity measures the density of links inside communities compared to links between different communities and is defined as a value between -1 and 1.

Child-directed speech

cds_modularities <- bind_rows(
  tibble(
  Speaker = "MOT",
  Language = "English",
  Modularity = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) bigrams_cds_fion %>% filter(Month == levels(factor(bigrams_cds_fion$Month))[i] & Speaker == "MOT" & lang == "en") %>% get_network_cds() %>% cluster_louvain(resolution = .3) %>% modularity()  ),
  Month = levels(factor(bigrams_cds_fion$Month)),
  Child = "Fion"
),

tibble(
  Speaker = "MOT",
  Language = "German",
  Modularity = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) bigrams_cds_fion %>% filter(Month == levels(factor(bigrams_cds_fion$Month))[i] & Speaker == "MOT" & lang == "de") %>% get_network_cds() %>% cluster_louvain(resolution = .3) %>% modularity()  ),
  Month = levels(factor(bigrams_cds_fion$Month)),
  Child = "Fion"
),

tibble(
  Speaker = "FAT",
  Language = "English",
  Modularity = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) bigrams_cds_fion %>% filter(Month == levels(factor(bigrams_cds_fion$Month))[i] & Speaker == "FAT" & lang=="en") %>% get_network_cds() %>% cluster_louvain(resolution = .3) %>% modularity()  ),
  Month = levels(factor(bigrams_cds_fion$Month)),
  Child = "Fion"
),

tibble(
  Speaker = "FAT",
  Language = "German",
  Modularity = sapply(1:length(levels(factor(bigrams_cds_fion$Month))), function(i) bigrams_cds_fion %>% filter(Month == levels(factor(bigrams_cds_fion$Month))[i] & Speaker == "FAT" & lang=="de") %>% get_network_cds() %>% cluster_louvain(resolution = .3) %>% modularity()  ),
  Month = levels(factor(bigrams_cds_fion$Month)),
  Child = "Fion"
),

tibble(
  Speaker = "MOT",
  Language = "English",
  Modularity = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) bigrams_cds_silvie %>% filter(Month == levels(factor(bigrams_cds_silvie$Month))[i] & Speaker == "MOT" & lang=="en") %>% get_network_cds() %>% cluster_louvain(resolution = .3) %>% modularity()  ),
  Month = levels(factor(bigrams_cds_silvie$Month)),
  Child = "Silvie"
),

tibble(
  Speaker = "MOT",
  Language = "German",
  Modularity = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) bigrams_cds_silvie %>% filter(Month == levels(factor(bigrams_cds_silvie$Month))[i] & Speaker == "MOT" & lang=="de") %>% get_network_cds() %>% cluster_louvain(resolution = .3) %>% modularity()  ),
  Month = levels(factor(bigrams_cds_silvie$Month)),
  Child = "Silvie"
),

tibble(
  Speaker = "FAT",
  Language = "English",
  Modularity = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) bigrams_cds_silvie %>% filter(Month == levels(factor(bigrams_cds_silvie$Month))[i] & Speaker == "FAT" & lang == "en") %>% get_network_cds() %>% cluster_louvain(resolution = .3) %>% modularity()  ),
  Month = levels(factor(bigrams_cds_silvie$Month)),
  Child = "Silvie"
),

tibble(
  Speaker = "FAT",
  Language = "German",
  Modularity = sapply(1:length(levels(factor(bigrams_cds_silvie$Month))), function(i) bigrams_cds_silvie %>% filter(Month == levels(factor(bigrams_cds_silvie$Month))[i] & Speaker == "FAT" & lang == "de") %>% get_network_cds() %>% cluster_louvain(resolution = .3) %>% modularity()  ),
  Month = levels(factor(bigrams_cds_silvie$Month)),
  Child = "Silvie"
)
) 

cds_modularities %>% ggplot(aes(x = Month, y = Modularity, group = Language, col = Language)) + geom_point() + geom_line() + facet_grid(Child~Speaker, scales = "free_x") +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18))

# ggsave("images/cds_modularities.png", width = 12, height = 10)

# does the outlier in the mother's data have to do with data sparsity? - apparently not!

# fion_cds %>% group_by(Speaker, Month) %>% summarise(
#   n=n()
# ) %>% filter(Speaker %in% c("MOT", "FAT")) %>%
#   ggplot(aes(x = Month, y = n, group = Speaker, col = Speaker)) +
# geom_point() +  geom_line()
# 
# bigrams_cds_fion %>% filter(Month == "03;08") %>% get_network_cds() %>% get_plot(interactive = T)
# bigrams_cds_fion %>% filter(Month == "03;09") %>% get_network_cds() %>% get_plot()
# bigrams_cds_fion %>% filter(Month == "03;10") %>% get_network_cds() %>% get_plot(interactive = T)

Child speech

# modularities over time - original data
bind_rows(
  tibble(
  modularity = sapply(1:length(levels(factor(bigrams_fion$Month))),
       function(i) filter(bigrams_fion, Month == levels(factor(bigrams_fion$Month))[i]) %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  age = levels(factor(bigrams_fion$Month)),
  Child = "Fion"
),

tibble(
  modularity = sapply(1:length(levels(factor(bigrams_silvie$Month))),
       function(i) filter(bigrams_silvie, Month == levels(factor(bigrams_silvie$Month))[i]) %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  age = levels(factor(bigrams_silvie$Month)),
  Child = "Silvie"
)
) %>% ggplot(aes(x = age, y = modularity, group = Child, col = Child, pch = Child)) + geom_point() + geom_line() +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  geom_smooth(linewidth = 0) + ylab("Modularity") + xlab("Age")

# Modularities over time - sampled data
modularities_sampled <- bind_rows(
  tibble(
  Period = levels(factor(bigrams_fion$age_range)),
  Modularity = sapply(1:7, function(i) filter(get(paste0("bigrams_fion0", i)), type == "german") %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  Child = "Fion",
  Language = "German"
),

tibble(
  Period = levels(factor(bigrams_fion$age_range)),
  Modularity = sapply(1:7, function(i) filter(get(paste0("bigrams_fion0", i)), type == "english") %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  Child = "Fion",
  Language = "English"
),

tibble(
  Period = levels(factor(bigrams_fion$age_range)),
  Modularity = sapply(1:7, function(i) filter(get(paste0("bigrams_fion0", i)), type == "mixed") %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  Child = "Fion",
  Language = "mixed"
),

tibble(
  Period = levels(factor(bigrams_fion$age_range)),
  Modularity = sapply(1:7, function(i) filter(get(paste0("bigrams_fion0", i))) %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  Child = "Fion",
  Language = "all"
),

tibble(
  Period = levels(factor(bigrams_silvie$age_range)),
  Modularity = sapply(1:6, function(i) filter(get(paste0("bigrams_silvie0", i)), type == "german") %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  Child = "Silvie",
  Language = "German"
),

tibble(
  Period = levels(factor(bigrams_silvie$age_range)),
  Modularity = sapply(1:6, function(i) filter(get(paste0("bigrams_silvie0", i)), type == "english") %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  Child = "Silvie",
  Language = "English"
),

tibble(
  Period = levels(factor(bigrams_silvie$age_range)),
  Modularity = sapply(1:6, function(i) filter(get(paste0("bigrams_silvie0", i)), type == "mixed") %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  Child = "Silvie",
  Language = "mixed"
),

tibble(
  Period = levels(factor(bigrams_silvie$age_range)),
  Modularity = sapply(1:6, function(i) filter(get(paste0("bigrams_silvie0", i))) %>% get_network() %>% cluster_louvain(resolution = .3) %>% modularity()),
  Child = "Silvie",
  Language = "all"
)
)

# add columns for letting the "all" line appear differently
modularities_sampled$alpha <- ifelse(modularities_sampled$Language=="all", 0.3, 1)
modularities_sampled$linetype <- ifelse(modularities_sampled$Language == "all", 2, 1)
modularities_sampled$linewidth <- ifelse(modularities_sampled$Language == "all", 2.5, .8)


modularities_sampled %>% 
  ggplot(aes(x = Period, y = Modularity, group = Language, color = Language, linewidth = linewidth, linetype = linetype, alpha = alpha)) + geom_point() + geom_line() + facet_wrap(~Child, scales = "free_x") +
  scale_linetype_identity() +
  scale_alpha_identity() +
  scale_linewidth_identity() +
  scale_color_manual(values = c("darkgreen", "deepskyblue", "salmon", "tan")) +
  theme(axis.text.x = element_text(angle=45, hjust=.9, size=12)) +
  theme(axis.text = element_text(size = 18)) +
  theme(axis.title = element_text(size = 18)) +
  theme(strip.text = element_text(size = 18)) +
  theme(legend.text = element_text(size = 18)) +
  theme(legend.title = element_text(size = 18, face = "bold")) +
  theme(text = element_text(size = 18)) +
  guides(alpha = "none", linewidth = "none")

# ggsave("images/modularities450.png", width = 12, height = 6)  

# What happens with Fion's English data in the 3rd period?
fionp3 <- bigrams_fion %>% filter(age_range == levels(factor(bigrams_fion$age_range))[3] & type == "english") %>% get_network() %>% get_plot() + ggtitle(paste0("Fion's English data, ", levels(factor(bigrams_fion$age_range))[3])) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

# compare this to the fourth period:
fionp4 <- bigrams_fion %>% filter(age_range == levels(factor(bigrams_fion$age_range))[4] & type == "english") %>% get_network() %>% get_plot() + ggtitle(paste0("Fion's English data, ", levels(factor(bigrams_fion$age_range))[4])) +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))

(fionp3 | fionp4)

# ggsave("images/fion_english_data_3and4_450.png", width = 12, height = 5)

Number of communities

How many communities are identified?

# communities divided by network size:
sapply(1:7, function(i) get(paste0("network_fion0", i)) %>% cluster_louvain(resolution = .3) %>% communities() %>% length()) / sapply(1:7, function(i) length(V(get(paste0("network_fion0", i)))))
[1] 0.17374517 0.13513514 0.10859729 0.13223140 0.11063830 0.08032129 0.09405941
sapply(1:6, function(i) get(paste0("network_silvie0", i)) %>% cluster_louvain(resolution = .3) %>% communities() %>% length()) / sapply(1:6, function(i) length(V(get(paste0("network_silvie0", i)))))
[1] 0.1254480 0.1553398 0.1344262 0.1147541 0.1025641 0.1317568
# relatively stable

Hub and authority scores

Child-directed speech

For each age range, we want to get the top 20 hubs and authorities:

# networks for Fion's and Silvie's mothers and fathers
for(i in 1:3) {
  assign(paste0("network_silvie_MOT_0", i), get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker == "MOT") %>%
           select(word1, word2) %>% na.omit %>% graph_from_data_frame(directed = TRUE))
}

for(i in 1:3) {
  assign(paste0("network_silvie_FAT_0", i), get(paste0("bigrams_cds_silvie0", i)) %>% filter(Speaker == "FAT") %>%
           select(word1, word2) %>% na.omit %>% graph_from_data_frame(directed = TRUE))
}

for(i in 1:3) {
  assign(paste0("network_fion_MOT_0", i), get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker == "MOT") %>%
           select(word1, word2) %>% na.omit %>% graph_from_data_frame(directed = TRUE))
}

for(i in 1:3) {
  assign(paste0("network_fion_FAT_0", i), get(paste0("bigrams_cds_fion0", i)) %>% filter(Speaker == "FAT") %>% select(word1, word2) %>% na.omit %>% graph_from_data_frame(directed = TRUE))
}

Top 20 hubs: Fion’s father

lapply(1:3, function(i) get(paste0("network_fion_FAT_0", i)) %>% hits_scores() %>% .$hub %>% enframe(name = "word", value = "Hub score") %>% arrange(desc(`Hub score`)) %>% head(20) %>% select(word) %>% setNames(levels(factor(bigrams_cds_fion$age_range))[i])) %>% bind_cols() #%>%
# A tibble: 20 × 3
   `02;03-02;10` `02;11-03;05` `03;06-03;11`
   <chr>         <chr>         <chr>        
 1 do            do            do           
 2 are           are           are          
 3 did           did           did          
 4 is            will          is           
 5 nt            is            nt           
 6 will          nt            will         
 7 you           what          you          
 8 finty         if            if           
 9 can           and           then         
10 with          that          wo           
11 for           you           ca           
12 now           can           and          
13 sure          it            now          
14 what          then          no           
15 to            would         what         
16 were          s             when         
17 then          with          with         
18 if            finty         to           
19 wo            does          fintey       
20 s             for           for          
  #writexl::write_xlsx("top20_hubs_fion_FAT.xlsx")

Top 20 hubs: Fion’s mother

lapply(1:3, function(i) get(paste0("network_fion_MOT_0", i)) %>% hits_scores() %>% .$hub %>% enframe(name = "word", value = "Hub score") %>% arrange(desc(`Hub score`)) %>% head(20) %>% select(word) %>% setNames(levels(factor(bigrams_cds_fion$age_range))[i])) %>% bind_cols() # %>%
# A tibble: 20 × 3
   `02;03-02;10` `02;11-03;05` `03;06-03;11`
   <chr>         <chr>         <chr>        
 1 hast          it            ich          
 2 du            that          und          
 3 und           what          du           
 4 willst        he            was          
 5 was           und           ist          
 6 ist           du            finty        
 7 finty         let           ja           
 8 wenn          fiony         in           
 9 ja            there         wenn         
10 das           finty         hast         
11 kannst        mal           das          
12 bist          ich           mal          
13 weisst        ja            aber         
14 musst         geht          sagt         
15 machst        where         soll         
16 ach           mack          denn         
17 mal           ist           sind         
18 denn          brian         ach          
19 heisst        in            habe         
20 oh            i             gesagt       
  # writexl::write_xlsx("top20_hubs_fion_MOT.xlsx")

Top 20 hubs: Silvie’s father

lapply(1:3, function(i) get(paste0("network_silvie_FAT_0", i)) %>% hits_scores() %>% .$hub %>% enframe(name = "word", value = "Hub score") %>% arrange(desc(`Hub score`)) %>% head(20) %>% select(word) %>% setNames(levels(factor(bigrams_cds_silvie$age_range))[i])) %>% bind_cols() # %>%
# A tibble: 20 × 3
   `02;04-02;10` `02;11-03;04` `03;05-03;09`
   <chr>         <chr>         <chr>        
 1 du            ich           und          
 2 und           na            du           
 3 das           du            ich          
 4 ich           ja            ja           
 5 wir           und           das          
 6 hast          hast          die          
 7 na            musst         na           
 8 ja            aber          wenn         
 9 die           wir           wir          
10 oh            die           aber         
11 kannst        so            sind         
12 bist          das           hast         
13 jetzt         denn          kann         
14 so            wenn          mal          
15 musst         oh            was          
16 wie           sind          hat          
17 willst        komm          wie          
18 emily         dass          da           
19 dann          guck          bist         
20 wenn          kannst        ist          
  # writexl::write_xlsx("top20_hubs_silvie_FAT.xlsx")

Top 20 hubs: Silvie’s mother

lapply(1:3, function(i) get(paste0("network_silvie_MOT_0", i)) %>% hits_scores() %>% .$hub %>% enframe(name = "word", value = "Hub score") %>% arrange(desc(`Hub score`)) %>% head(20) %>% select(word) %>% setNames(levels(factor(bigrams_cds_silvie$age_range))[i])) %>% bind_cols() # %>%
# A tibble: 20 × 3
   `02;04-02;10` `02;11-03;04` `03;05-03;09`
   <chr>         <chr>         <chr>        
 1 that          that          that         
 2 it            it            it           
 3 let           let           what         
 4 what          what          let          
 5 he            he            he           
 6 daddy         there         there        
 7 there         daddy         you          
 8 where         do            do           
 9 you           where         where        
10 are           you           have         
11 do            have          daddy        
12 who           one           are          
13 one           are           we           
14 have          here          to           
15 can           can           and          
16 she           s             who          
17 s             we            s            
18 this          she           can          
19 and           and           if           
20 if            to            nt           
   # writexl::write_xlsx("top20_hubs_silvie_MOT.xlsx")

Top 20 authorities: Fion’s father

lapply(1:3, function(i) get(paste0("network_fion_FAT_0", i)) %>% hits_scores() %>% .$authority %>% enframe(name = "word", value = "authority score") %>% arrange(desc(`authority score`)) %>% head(20) %>% select(word) %>% setNames(levels(factor(bigrams_cds_fion$age_range))[i])) %>% bind_cols() #%>%
# A tibble: 20 × 3
   `02;03-02;10` `02;11-03;05` `03;06-03;11`
   <chr>         <chr>         <chr>        
 1 you           you           nt           
 2 nt            nt            you          
 3 it            it            it           
 4 they          that          that         
 5 that          s             the          
 6 the           i             re           
 7 he            the           i            
 8 a             a             do           
 9 i             they          a            
10 s             we            s            
11 want          do            your         
12 do            he            have         
13 re            re            they         
14 to            your          he           
15 know          know          we           
16 your          have          want         
17 have          to            is           
18 we            is            can          
19 finty         want          to           
20 what          what          know         
  #writexl::write_xlsx("top20_authorities_fion_FAT.xlsx")

Top 20 authorities: Fion’s mother

lapply(1:3, function(i) get(paste0("network_fion_MOT_0", i)) %>% hits_scores() %>% .$authority %>% enframe(name = "word", value = "authority score") %>% arrange(desc(`authority score`)) %>% head(20) %>% select(word) %>% setNames(levels(factor(bigrams_cds_fion$age_range))[i])) %>% bind_cols() #%>%
# A tibble: 20 × 3
   `02;03-02;10` `02;11-03;05` `03;06-03;11`
   <chr>         <chr>         <chr>        
 1 du            s             ich          
 2 das           was           habe         
 3 denn          is            der          
 4 ist           das           du           
 5 hast          du            die          
 6 was           in            das          
 7 bist          it            da           
 8 da            the           was          
 9 so            he            auch         
10 die           on            nicht        
11 ich           does          muss         
12 nicht         ich           dann         
13 der           did           dir          
14 ja            from          ja           
15 dann          nicht         mal          
16 finty         finty         denn         
17 auch          have          hier         
18 mal           do            zu           
19 kannst        die           will         
20 noch          hier          bin          
  #writexl::write_xlsx("top20_authorities_fion_MOT.xlsx")

Top 20 authorities: Silvie’s father

lapply(1:3, function(i) get(paste0("network_silvie_FAT_0", i)) %>% hits_scores() %>% .$authority %>% enframe(name = "word", value = "authority score") %>% arrange(desc(`authority score`)) %>% head(20) %>% select(word) %>% setNames(levels(factor(bigrams_cds_silvie$age_range))[i])) %>% bind_cols() #%>%
# A tibble: 20 × 3
   `02;04-02;10` `02;11-03;04` `03;05-03;09`
   <chr>         <chr>         <chr>        
 1 das           das           das          
 2 du            bin           die          
 3 die           du            du           
 4 dann          na            ich          
 5 bin           die           sind         
 6 hast          ich           ja           
 7 noch          ja            dann         
 8 auch          hast          noch         
 9 sind          noch          nicht        
10 kannst        dann          wir          
11 war           denke         hier         
12 da            der           was          
13 jetzt         kannst        kannst       
14 mal           so            so           
15 ich           mit           mit          
16 machen        da            mal          
17 hier          wir           bin          
18 bist          auch          dir          
19 den           komm          war          
20 emily         aber          auch         
  #writexl::write_xlsx("top20_authorities_silvie_FAT.xlsx")

Top 20 authorities: Silvie’s mother

lapply(1:3, function(i) get(paste0("network_silvie_MOT_0", i)) %>% hits_scores() %>% .$authority %>% enframe(name = "word", value = "authority score") %>% arrange(desc(`authority score`)) %>% head(20) %>% select(word) %>% setNames(levels(factor(bigrams_cds_silvie$age_range))[i])) %>% bind_cols() # %>%
# A tibble: 20 × 3
   `02;04-02;10` `02;11-03;04` `03;05-03;09`
   <chr>         <chr>         <chr>        
 1 s             s             s            
 2 you           you           you          
 3 was           one           is           
 4 is            was           was          
 5 one           is            one          
 6 are           we            in           
 7 a             do            do           
 8 do            to            are          
 9 did           the           we           
10 does          a             the          
11 the           in            a            
12 can           does          to           
13 we            can           have         
14 in            it            nt           
15 have          nt            can          
16 it            all           does         
17 nice          are           re           
18 to            have          all          
19 goes          did           it           
20 nt            i             did          
   # writexl::write_xlsx("top20_authorities_silvie_MOT.xlsx")

Child speech

# get top 20 hubs for each of the six/seven periods in Silvie's and Fion's data

bind_cols(lapply(1:7, function(i) get(paste0("network_fion0", i)) %>% hits_scores() %>% .$authority %>% enframe() %>% head(20) %>% select(name) %>% setNames(levels(factor(bigrams_fion$age_range))[i]))) #%>% writexl::write_xlsx("fion_authorities.xlsx")
# A tibble: 20 × 7
   `02;03-02;06` `02;07-02;09` `02;10-03;00` `03;01-03;03` `03;04-03;06`
   <chr>         <chr>         <chr>         <chr>         <chr>        
 1 nein          guck          this          das           das          
 2 schiff        ich           istis         die           italien      
 3 und           das           mache         ich           ich          
 4 show          die           auch          du            ist          
 5 time          und           da            ein           mach         
 6 ein           sit           hueh          weiss         meiner       
 7 me            da            ich           aber          und          
 8 one           hoch          und           wohnen        mal          
 9 i             ist           jetzt         fuenf         da           
10 da            meine         aber          fussball      habe         
11 in            nein          der           gehen         hey          
12 mein          noch          die           hab           meine        
13 a             oder          fanta         ist           nicht        
14 an            weiss         feuerwehr     nicht         ruck         
15 baby          will          herehier      puppe         do           
16 anderer       eier          in            sind          night        
17 anderanderer  ein           musik         wann          this         
18 das           haende        nein          was           herehier     
19 eine          kann          schokolade    wo            die          
20 er            mehr          so            ahoi          aus          
# ℹ 2 more variables: `03;07-03;09` <chr>, `03;10-03;11` <chr>
bind_cols(lapply(1:6, function(i) get(paste0("network_silvie0", i)) %>% hits_scores() %>% .$authority %>% enframe() %>% head(20) %>% select(name) %>% setNames(levels(factor(bigrams_fion$age_range))[i]))) #%>% writexl::write_xlsx("silvie_authorities.xlsx")
# A tibble: 20 × 6
   `02;03-02;06` `02;07-02;09` `02;10-03;00` `03;01-03;03` `03;04-03;06`
   <chr>         <chr>         <chr>         <chr>         <chr>        
 1 ich           gar           ich           mache         gar          
 2 this          das           auf           das           das          
 3 noch          hallo         aber          ich           guck         
 4 das           ich           bin           kannst        ich          
 5 nein          derdie        der           der           ist          
 6 piep          ein           habe          ja            und          
 7 da            hast          jetzt         nein          du           
 8 der           nicht         krabbel       sein          nein         
 9 du            noch          bear          stop          noch         
10 ein           und           candle        for           wir          
11 eins          wir           happy         au            bye          
12 haben         wollen        the           doch          isaac        
13 jetzt         foot          time          drauf         birthday     
14 kann          ice           yeah          du            aber         
15 moechte       the           rote          fertig        anfangen     
16 und           washing       auch          fuenf         da           
17 weiss         aber          da            guck          dem          
18 zwei          auf           das           habe          der          
19 a             bin           du            hat           ganz         
20 big           da            geht          immer         haben        
# ℹ 1 more variable: `03;07-03;09` <chr>
bind_cols(lapply(1:7, function(i) get(paste0("network_fion0", i)) %>% hits_scores() %>% .$hub %>% enframe() %>% head(20) %>% select(name) %>% setNames(levels(factor(bigrams_fion$age_range))[i]))) # %>% writexl::write_xlsx("fion_hubs.xlsx")
# A tibble: 20 × 7
   `02;03-02;06` `02;07-02;09` `02;10-03;00` `03;01-03;03` `03;04-03;06`
   <chr>         <chr>         <chr>         <chr>         <chr>        
 1 nein          guck          this          das           das          
 2 schiff        ich           istis         die           italien      
 3 und           das           mache         ich           ich          
 4 show          die           auch          du            ist          
 5 time          und           da            ein           mach         
 6 ein           sit           hueh          weiss         meiner       
 7 me            da            ich           aber          und          
 8 one           hoch          und           wohnen        mal          
 9 i             ist           jetzt         fuenf         da           
10 da            meine         aber          fussball      habe         
11 in            nein          der           gehen         hey          
12 mein          noch          die           hab           meine        
13 a             oder          fanta         ist           nicht        
14 an            weiss         feuerwehr     nicht         ruck         
15 baby          will          herehier      puppe         do           
16 anderer       eier          in            sind          night        
17 anderanderer  ein           musik         wann          this         
18 das           haende        nein          was           herehier     
19 eine          kann          schokolade    wo            die          
20 er            mehr          so            ahoi          aus          
# ℹ 2 more variables: `03;07-03;09` <chr>, `03;10-03;11` <chr>
bind_cols(lapply(1:6, function(i) get(paste0("network_silvie0", i)) %>% hits_scores() %>% .$hub %>% enframe() %>% head(20) %>% select(name) %>% setNames(levels(factor(bigrams_fion$age_range))[i]))) #%>% writexl::write_xlsx("silvie_hubs.xlsx")
# A tibble: 20 × 6
   `02;03-02;06` `02;07-02;09` `02;10-03;00` `03;01-03;03` `03;04-03;06`
   <chr>         <chr>         <chr>         <chr>         <chr>        
 1 ich           gar           ich           mache         gar          
 2 this          das           auf           das           das          
 3 noch          hallo         aber          ich           guck         
 4 das           ich           bin           kannst        ich          
 5 nein          derdie        der           der           ist          
 6 piep          ein           habe          ja            und          
 7 da            hast          jetzt         nein          du           
 8 der           nicht         krabbel       sein          nein         
 9 du            noch          bear          stop          noch         
10 ein           und           candle        for           wir          
11 eins          wir           happy         au            bye          
12 haben         wollen        the           doch          isaac        
13 jetzt         foot          time          drauf         birthday     
14 kann          ice           yeah          du            aber         
15 moechte       the           rote          fertig        anfangen     
16 und           washing       auch          fuenf         da           
17 weiss         aber          da            guck          dem          
18 zwei          auf           das           habe          der          
19 a             bin           du            hat           ganz         
20 big           da            geht          immer         haben        
# ℹ 1 more variable: `03;07-03;09` <chr>

Data overview

bind_rows(
  
d_fion %>% group_by(age_range) %>% summarise(
  `Number of utterances` = n(),
  `Number of words` = sum(Wordcount),
  Child = "Fion"
),

d_silvie %>% group_by(age_range) %>% summarise(
  `Number of utterances` = n(),
  `Number of words` = sum(Wordcount),
  Child = "Silvie"
)
) # %>% writexl::write_xlsx("data_overview.xlsx")
# A tibble: 13 × 4
   age_range   `Number of utterances` `Number of words` Child 
   <chr>                        <int>             <dbl> <chr> 
 1 02;03-02;06                  11573             23406 Fion  
 2 02;07-02;09                  11733             30854 Fion  
 3 02;10-03;00                   5572             16548 Fion  
 4 03;01-03;03                   8454             27381 Fion  
 5 03;04-03;06                   4335             14541 Fion  
 6 03;07-03;09                   5075             16321 Fion  
 7 03;10-03;11                   1070              3322 Fion  
 8 02;04-02;07                  10990             31374 Silvie
 9 02;08-02;10                   7879             25511 Silvie
10 02;11-03;01                   7326             25008 Silvie
11 03;02-03;04                   6456             23737 Silvie
12 03;05-03;07                   2750             10585 Silvie
13 03;08-03;09                   2594             10119 Silvie
bind_rows(
  fion_cds %>% group_by(age_range) %>% summarise(
  `Number of utterances` = n(),
  `Number of words` = sum(Wordcount),
  Child = "Fion"
),

silvie_cds %>% group_by(age_range) %>% summarise(
  `Number of utterances` = n(),
  `Number of words` = sum(Wordcount),
  Child = "Silvie"
)
) #%>% writexl::write_xlsx("data_overview_cds.xlsx")
# A tibble: 6 × 4
  age_range   `Number of utterances` `Number of words` Child 
  <chr>                        <int>             <dbl> <chr> 
1 02;03-02;10                  48566            173044 Fion  
2 02;11-03;05                  40048            153388 Fion  
3 03;06-03;11                  17017             63726 Fion  
4 02;04-02;10                  49317            268818 Silvie
5 02;11-03;04                  31504            167093 Silvie
6 03;05-03;09                  13833             72778 Silvie
# only parents:

bind_rows(
  fion_cds %>% filter(Speaker %in% c("MOT", "FAT")) %>%
    group_by(age_range) %>% summarise(
  `Number of utterances` = n(),
  `Number of words` = sum(Wordcount),
  Child = "Fion"
),

silvie_cds %>% filter(Speaker %in% c("MOT", "FAT")) %>%
  group_by(age_range) %>% summarise(
  `Number of utterances` = n(),
  `Number of words` = sum(Wordcount),
  Child = "Silvie"
)
)
# A tibble: 6 × 4
  age_range   `Number of utterances` `Number of words` Child 
  <chr>                        <int>             <dbl> <chr> 
1 02;03-02;10                  39053            141929 Fion  
2 02;11-03;05                  32538            129368 Fion  
3 03;06-03;11                  14684             56672 Fion  
4 02;04-02;10                  44882            246580 Silvie
5 02;11-03;04                  29348            156610 Silvie
6 03;05-03;09                  11506             61605 Silvie

References

Ibbotson, Paul, Vsevolod Salnikov & Richard Walker. 2019. A dynamic network analysis of emergent grammar. First Language 39(6). 652–680. https://doi.org/10.1177/0142723719869562.
Koch, Nikolas, Antje Endesfelder Quick & Stefan Hartmann. 2025. Recycling constructional patterns: The role of chunks in early bilingual acquisition. International Journal of Bilingualism 13670069251346103. https://doi.org/10.1177/13670069251346103.
Quick, Antje Endesfelder, Elena Lieven, Ad Backus & Michael Tomasello. 2018. Constructively combining languages: The use of code-mixing in German-English bilingual child language acquisition. Linguistic Approaches to Bilingualism 8(3). 393–409. https://doi.org/10.1075/lab.17008.qui.